author    ed <ed@FreeBSD.org>  2009-06-02 17:52:33 +0000
committer ed <ed@FreeBSD.org>  2009-06-02 17:52:33 +0000
commit    3277b69d734b9c90b44ebde4ede005717e2c3b2e (patch)
tree      64ba909838c23261cace781ece27d106134ea451  /lib/Target/X86
Import LLVM, at r72732.
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmPrinter/CMakeLists.txt  11
-rw-r--r--  lib/Target/X86/AsmPrinter/Makefile  15
-rw-r--r--  lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp  1075
-rw-r--r--  lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h  164
-rw-r--r--  lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp  50
-rw-r--r--  lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp  609
-rw-r--r--  lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h  152
-rw-r--r--  lib/Target/X86/CMakeLists.txt  29
-rw-r--r--  lib/Target/X86/Makefile  23
-rw-r--r--  lib/Target/X86/README-FPStack.txt  85
-rw-r--r--  lib/Target/X86/README-MMX.txt  71
-rw-r--r--  lib/Target/X86/README-SSE.txt  918
-rw-r--r--  lib/Target/X86/README-UNIMPLEMENTED.txt  14
-rw-r--r--  lib/Target/X86/README-X86-64.txt  251
-rw-r--r--  lib/Target/X86/README.txt  1899
-rw-r--r--  lib/Target/X86/X86.h  84
-rw-r--r--  lib/Target/X86/X86.td  184
-rw-r--r--  lib/Target/X86/X86COFF.h  95
-rw-r--r--  lib/Target/X86/X86CallingConv.td  360
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp  811
-rw-r--r--  lib/Target/X86/X86CompilationCallback_Win64.asm  67
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.cpp  18
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.h  29
-rw-r--r--  lib/Target/X86/X86FastISel.cpp  1549
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp  1187
-rw-r--r--  lib/Target/X86/X86FloatingPointRegKill.cpp  139
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp  1716
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  8794
-rw-r--r--  lib/Target/X86/X86ISelLowering.h  705
-rw-r--r--  lib/Target/X86/X86Instr64bit.td  1937
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h  168
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td  597
-rw-r--r--  lib/Target/X86/X86InstrFormats.td  285
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp  3227
-rw-r--r--  lib/Target/X86/X86InstrInfo.h  461
-rw-r--r--  lib/Target/X86/X86InstrInfo.td  3961
-rw-r--r--  lib/Target/X86/X86InstrMMX.td  694
-rw-r--r--  lib/Target/X86/X86InstrSSE.td  3643
-rw-r--r--  lib/Target/X86/X86JITInfo.cpp  560
-rw-r--r--  lib/Target/X86/X86JITInfo.h  84
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h  112
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp  1280
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h  163
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td  762
-rw-r--r--  lib/Target/X86/X86Relocations.h  42
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp  446
-rw-r--r--  lib/Target/X86/X86Subtarget.h  224
-rw-r--r--  lib/Target/X86/X86TargetAsmInfo.cpp  461
-rw-r--r--  lib/Target/X86/X86TargetAsmInfo.h  75
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp  317
-rw-r--r--  lib/Target/X86/X86TargetMachine.h  124
51 files changed, 40727 insertions, 0 deletions
diff --git a/lib/Target/X86/AsmPrinter/CMakeLists.txt b/lib/Target/X86/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..dbd03d8
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,11 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_partially_linked_object(LLVMX86AsmPrinter
+ X86ATTAsmPrinter.cpp
+ X86AsmPrinter.cpp
+ X86IntelAsmPrinter.cpp
+ )
+
+target_name_of_partially_linked_object(LLVMX86CodeGen n)
+
+add_dependencies(LLVMX86AsmPrinter ${n})
diff --git a/lib/Target/X86/AsmPrinter/Makefile b/lib/Target/X86/AsmPrinter/Makefile
new file mode 100644
index 0000000..ba89ac6
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmPrinter
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp
new file mode 100644
index 0000000..8afe2ea
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.cpp
@@ -0,0 +1,1075 @@
+//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTAsmPrinter.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetAsmInfo.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
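+// getPICLabelString - Return the label used as the PIC base within function
+// number FnNum: e.g. for function 7 this is "L7$pb" (quoted) on Darwin and
+// .Lllvm$7.$piclabel on ELF targets.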
+static std::string getPICLabelString(unsigned FnNum,
+ const TargetAsmInfo *TAI,
+ const X86Subtarget* Subtarget) {
+ std::string label;
+ if (Subtarget->isTargetDarwin())
+ label = "\"L" + utostr_32(FnNum) + "$pb\"";
+ else if (Subtarget->isTargetELF())
+ label = ".Lllvm$" + utostr_32(FnNum) + "." "$piclabel";
+ else
+ assert(0 && "Don't know how to print PIC label!\n");
+
+ return label;
+}
+
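+// calculateFunctionInfo - Compute how many argument bytes a stdcall/fastcall
+// function pops on return: each parameter is rounded up to a 4-byte slot and
+// the total is recorded for use in the @N name suffix below.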
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData *TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ unsigned argNum = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI, ++argNum) {
+ const Type* Ty = AI->getType();
+
+ // 'Dereference' type in case of byval parameter attribute
+ if (F->paramHasAttr(argNum, Attribute::ByVal))
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ // Size should be aligned to DWORD boundary
+ Size += ((TD->getTypeAllocSize(Ty) + 3)/4)*4;
+ }
+
+ // We're not supporting tooooo huge arguments :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \\n or \\0.
+static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+ for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+ Name != E; ++Name)
+ if (isprint(*Name))
+ OS << *Name;
+}
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
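+/// For example, on Cygwin/MinGW a stdcall function void foo(int, int) is
+/// emitted as _foo@8, while the fastcall variant becomes @foo@8 (the total
+/// argument size in bytes is appended after '@').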
+void X86ATTAsmPrinter::decorateName(std::string &Name,
+ const GlobalValue *GV) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ unsigned CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ // Decorate names only when we're targeting Cygwin/Mingw32 targets
+ if (!Subtarget->isTargetCygMing())
+ return;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+ const X86MachineFunctionInfo *Info;
+ if (info_item == FunctionInfoMap.end()) {
+ // Calculate the appropriate function info and populate the map
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ switch (Info->getDecorationStyle()) {
+ case None:
+ break;
+ case StdCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+ break;
+ case FastCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+ if (Name[0] == '_') {
+ Name[0] = '@';
+ } else {
+ Name = '@' + Name;
+ }
+ break;
+ default:
+ assert(0 && "Unsupported DecorationStyle");
+ }
+}
+
+void X86ATTAsmPrinter::emitFunctionHeader(const MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+
+ decorateName(CurrentFnName, F);
+
+ SwitchToSection(TAI->SectionForGlobal(F));
+
+ unsigned FnAlign = 4;
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ FnAlign = 1;
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ case Function::PrivateLinkage:
+ EmitAlignment(FnAlign, F);
+ break;
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ EmitAlignment(FnAlign, F);
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ break;
+ case Function::LinkOnceAnyLinkage:
+ case Function::LinkOnceODRLinkage:
+ case Function::WeakAnyLinkage:
+ case Function::WeakODRLinkage:
+ EmitAlignment(FnAlign, F);
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl\t" << CurrentFnName << '\n';
+ O << TAI->getWeakDefDirective() << CurrentFnName << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << CurrentFnName << "\n"
+ "\t.linkonce discard\n";
+ } else {
+ O << "\t.weak\t" << CurrentFnName << '\n';
+ }
+ break;
+ }
+
+ printVisibility(CurrentFnName, F->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << CurrentFnName << ",@function\n";
+ else if (Subtarget->isTargetCygMing()) {
+ O << "\t.def\t " << CurrentFnName
+ << ";\t.scl\t" <<
+ (F->hasInternalLinkage() ? COFF::C_STAT : COFF::C_EXT)
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ O << CurrentFnName << ":\n";
+ // Add a workaround for linkonce linkage on Cygwin/MinGW
+ if (Subtarget->isTargetCygMing() &&
+ (F->hasLinkOnceLinkage() || F->hasWeakLinkage()))
+ O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ const Function *F = MF.getFunction();
+ this->MF = &MF;
+ unsigned CC = F->getCallingConv();
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Populate the function information map. We don't want to populate
+ // non-stdcall or non-fastcall functions' information right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ if (F->hasDLLExportLinkage())
+ DLLExportedFns.insert(Mang->makeNameProper(F->getName(), ""));
+
+ // Print the 'header' of function
+ emitFunctionHeader(MF);
+
+ // Emit pre-function debug and/or EH information.
+ if (TAI->doesSupportDebugInformation() || TAI->doesSupportExceptionHandling())
+ DW->BeginFunction(&MF);
+
+ // Print out code for the function.
+ bool hasAnyRealCode = false;
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (!VerboseAsm && (I->pred_empty() || I->isOnlyReachableByFallthrough())) {
+ // This is an entry block or a block that's only reachable via a
+ // fallthrough edge. In non-VerboseAsm mode, don't print the label.
+ } else {
+ printBasicBlockLabel(I, true, true, VerboseAsm);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+ II != IE; ++II) {
+ // Print the assembly for the instruction.
+ if (!II->isLabel())
+ hasAnyRealCode = true;
+ printMachineInstruction(II);
+ }
+ }
+
+ if (Subtarget->isTargetDarwin() && !hasAnyRealCode) {
+ // If the function is empty, then we need to emit *something*. Otherwise,
+ // the function's label might be associated with something that it wasn't
+ // meant to be associated with. We emit a noop in this situation.
+ // We are assuming inline asms are code.
+ O << "\tnop\n";
+ }
+
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << CurrentFnName << ", .-" << CurrentFnName << '\n';
+
+ // Emit post-function debug information.
+ if (TAI->doesSupportDebugInformation())
+ DW->EndFunction(&MF);
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O.flush();
+
+ // We didn't modify anything.
+ return false;
+}
+
+static inline bool shouldPrintGOT(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_;
+}
+
+static inline bool shouldPrintPLT(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_ &&
+ (ST->isPICStyleRIPRel() || ST->isPICStyleGOT());
+}
+
+static inline bool shouldPrintStub(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static;
+}
+
+void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier, bool NotRIPRel) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should not make it this far!");
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << TRI->getAsmName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier || (strcmp(Modifier, "debug") &&
+ strcmp(Modifier, "mem") &&
+ strcmp(Modifier, "call")))
+ O << '$';
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB(), false, false, VerboseAsm);
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ printOffset(MO.getOffset());
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool needCloseParen = false;
+
+ const GlobalValue *GV = MO.getGlobal();
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+ }
+
+ bool isThreadLocal = GVar && GVar->isThreadLocal();
+
+ std::string Name = Mang->getValueName(GV);
+ decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ if (shouldPrintStub(TM, Subtarget)) {
+ // Link-once, declaration, or Weakly-linked global variables need
+ // non-lazily-resolved stubs
+ if (GV->isDeclaration() || GV->isWeakForLinker()) {
+ // Dynamically-resolved functions need a stub for the function.
+ if (isCallOp && isa<Function>(GV)) {
+ // Function stubs are no longer needed for Mac OS X 10.5 and up.
+ if (Subtarget->isTargetDarwin() && Subtarget->getDarwinVers() >= 9) {
+ O << Name;
+ } else {
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ }
+ } else if (GV->hasHiddenVisibility()) {
+ if (!GV->isDeclaration() && !GV->hasCommonLinkage())
+ // Definition is not definitely in the current translation unit.
+ O << Name;
+ else {
+ HiddenGVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ } else {
+ GVStubs.insert(Name);
+ printSuffixedName(Name, "$non_lazy_ptr");
+ }
+ } else {
+ if (GV->hasDLLImportLinkage())
+ O << "__imp_";
+ O << Name;
+ }
+
+ if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
+ O << '-' << getPICLabelString(getFunctionNumber(), TAI, Subtarget);
+ } else {
+ if (GV->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name;
+
+ if (isCallOp) {
+ if (shouldPrintPLT(TM, Subtarget)) {
+ // Assemble call via PLT for externally visible symbols
+ if (!GV->hasHiddenVisibility() && !GV->hasProtectedVisibility() &&
+ !GV->hasLocalLinkage())
+ O << "@PLT";
+ }
+ if (Subtarget->isTargetCygMing() && GV->isDeclaration())
+ // Save function name for later type emission
+ FnStubs.insert(Name);
+ }
+ }
+
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ printOffset(MO.getOffset());
+
+ if (isThreadLocal) {
+ TLSModel::Model model = getTLSModel(GVar, TM.getRelocationModel());
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ O << "@TLSGD";
+ break;
+ case TLSModel::LocalDynamic:
+ // O << "@TLSLD"; // local dynamic not implemented
+ O << "@TLSGD";
+ break;
+ case TLSModel::InitialExec:
+ if (Subtarget->is64Bit()) {
+ assert (!NotRIPRel);
+ O << "@GOTTPOFF(%rip)";
+ } else {
+ O << "@INDNTPOFF";
+ }
+ break;
+ case TLSModel::LocalExec:
+ if (Subtarget->is64Bit())
+ O << "@TPOFF";
+ else
+ O << "@NTPOFF";
+ break;
+ default:
+ assert (0 && "Unknown TLS model");
+ }
+ } else if (isMemOp) {
+ if (shouldPrintGOT(TM, Subtarget)) {
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+ O << "@GOT";
+ else
+ O << "@GOTOFF";
+ } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) {
+ if (TM.getRelocationModel() != Reloc::Static) {
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+ O << "@GOTPCREL";
+
+ if (needCloseParen) {
+ needCloseParen = false;
+ O << ')';
+ }
+ }
+
+ // Use rip when possible to reduce code size, except when
+ // index or base register are also part of the address. e.g.
+ // foo(%rip)(%rcx,%rax,4) is not legal
+ O << "(%rip)";
+ }
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool needCloseParen = false;
+ std::string Name(TAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ // Print function stub suffix unless it's Mac OS X 10.5 and up.
+ if (isCallOp && shouldPrintStub(TM, Subtarget) &&
+ !(Subtarget->isTargetDarwin() && Subtarget->getDarwinVers() >= 9)) {
+ FnStubs.insert(Name);
+ printSuffixedName(Name, "$stub");
+ return;
+ }
+ if (!isMemOp && !isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ O << Name;
+
+ if (shouldPrintPLT(TM, Subtarget)) {
+ std::string GOTName(TAI->getGlobalPrefix());
+ GOTName+="_GLOBAL_OFFSET_TABLE_";
+ if (Name == GOTName)
+ // HACK! Emit extra offset to PC during printing GOT offset to
+ // compensate for the size of popl instruction. The resulting code
+ // should look like:
+ // call .piclabel
+ // piclabel:
+ // popl %some_register
+ // addl $_GLOBAL_ADDRESS_TABLE_ + [.-piclabel], %some_register
+ O << " + [.-"
+ << getPICLabelString(getFunctionNumber(), TAI, Subtarget) << ']';
+
+ if (isCallOp)
+ O << "@PLT";
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ if (!isCallOp && Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
+
+void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
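+// printLeaMemReference - Print an address in AT&T syntax, i.e.
+// displacement(base,index,scale), for example 16(%rsp,%rax,4); the scale is
+// omitted when it is 1.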
+void X86ATTAsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier,
+ bool NotRIPRel) {
+ MachineOperand BaseReg = MI->getOperand(Op);
+ MachineOperand IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ NotRIPRel |= IndexReg.getReg() || BaseReg.getReg();
+ if (DispSpec.isGlobal() ||
+ DispSpec.isCPI() ||
+ DispSpec.isJTI() ||
+ DispSpec.isSymbol()) {
+ printOperand(MI, Op+3, "mem", NotRIPRel);
+ } else {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << DispVal;
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ unsigned BaseRegOperand = 0, IndexRegOperand = 2;
+
+ // There are cases where we can end up with ESP/RSP in the indexreg slot.
+ // If this happens, swap the base/index register to support assemblers that
+ // don't work when the index is *SP.
+ if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) {
+ assert(ScaleVal == 1 && "Scale not supported for stack pointer!");
+ std::swap(BaseReg, IndexReg);
+ std::swap(BaseRegOperand, IndexRegOperand);
+ }
+
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op+BaseRegOperand, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+IndexRegOperand, Modifier);
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier, bool NotRIPRel){
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ MachineOperand Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, Modifier, NotRIPRel);
+}
+
+void X86ATTAsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ // We don't need .set machinery if we have GOT-style relocations
+ if (Subtarget->isPICStyleGOT())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ if (Subtarget->isPICStyleRIPRel())
+ O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << '_' << uid << '\n';
+ else
+ O << '-' << getPICLabelString(getFunctionNumber(), TAI, Subtarget) << '\n';
+}
+
+void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ std::string label = getPICLabelString(getFunctionNumber(), TAI, Subtarget);
+ O << label << '\n' << label << ':';
+}
+
+
+void X86ATTAsmPrinter::printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const
+{
+ const char *JTEntryDirective = MJTI->getEntrySize() == 4 ?
+ TAI->getData32bitsDirective() : TAI->getData64bitsDirective();
+
+ O << JTEntryDirective << ' ';
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleRIPRel() || Subtarget->isPICStyleStub()) {
+ O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << '_' << uid << "_set_" << MBB->getNumber();
+ } else if (Subtarget->isPICStyleGOT()) {
+ printBasicBlockLabel(MBB, false, false, false);
+ O << "@GOTOFF";
+ } else
+ assert(0 && "Don't know how to print MBB label for this PIC mode");
+ } else
+ printBasicBlockLabel(MBB, false, false, false);
+}
+
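+// printAsmMRegister - Print a register for a GCC-style inline-asm size
+// modifier: 'b' = 8-bit, 'h' = 8-bit high, 'w' = 16-bit, 'k' = 32-bit,
+// 'q' = 64-bit. For example, %rax printed with modifier 'k' becomes %eax.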
+bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ case 'q': // Print DImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ break;
+ }
+
+ O << '%'<< TRI->getAsmName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'c': // Don't print "$" before a global var name or constant.
+ printOperand(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MI->getOperand(OpNo).isReg())
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ printOperand(MI, OpNo);
+ return false;
+
+ case 'P': // Don't print @PLT, but do print as memory.
+ printOperand(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, "mem", /*NotRIPRel=*/true);
+ return false;
+ }
+ }
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction MI in
+/// AT&T syntax to the current output stream.
+///
+void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+/// doInitialization
+bool X86ATTAsmPrinter::doInitialization(Module &M) {
+
+ bool Result = AsmPrinter::doInitialization(M);
+
+ if (TAI->doesSupportDebugInformation()) {
+ // Let PassManager know we need debug information and relay
+ // the MachineModuleInfo address on to DwarfWriter.
+ // AsmPrinter::doInitialization did this analysis.
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->BeginModule(&M, MMI, O, this, TAI);
+ }
+
+ // Darwin wants symbols to be quoted if they have complex names.
+ if (Subtarget->isTargetDarwin())
+ Mang->setUseQuotes(true);
+
+ return Result;
+}
+
+
+void X86ATTAsmPrinter::printModuleLevelGV(const GlobalVariable* GVar) {
+ const TargetData *TD = TM.getTargetData();
+
+ if (!GVar->hasInitializer())
+ return; // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GVar)) {
+ if (Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() == Reloc::Static) {
+ if (GVar->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (GVar->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ return;
+ }
+
+ std::string name = Mang->getValueName(GVar);
+ Constant *C = GVar->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeAllocSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(GVar);
+
+ printVisibility(name, GVar->getVisibility());
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type\t" << name << ",@object\n";
+
+ SwitchToSection(TAI->SectionForGlobal(GVar));
+
+ if (C->isNullValue() && !GVar->hasSection() &&
+ !(Subtarget->isTargetDarwin() &&
+ TAI->SectionKindForGlobal(GVar) == SectionKind::RODataMergeStr)) {
+ // FIXME: This seems to be pretty darwin-specific
+ if (GVar->hasExternalLinkage()) {
+ if (const char *Directive = TAI->getZeroFillDirective()) {
+ O << "\t.globl " << name << '\n';
+ O << Directive << "__DATA, __common, " << name << ", "
+ << Size << ", " << Align << '\n';
+ return;
+ }
+ }
+
+ if (!GVar->isThreadLocal() &&
+ (GVar->hasLocalLinkage() || GVar->isWeakForLinker())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ if (TAI->getLCOMMDirective() != NULL) {
+ if (GVar->hasLocalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << ',' << Size;
+ if (Subtarget->isTargetDarwin())
+ O << ',' << Align;
+ } else if (Subtarget->isTargetDarwin() && !GVar->hasCommonLinkage()) {
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm) {
+ O << "\t\t\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ EmitGlobalConstant(C);
+ return;
+ } else {
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ } else {
+ if (!Subtarget->isTargetCygMing()) {
+ if (GVar->hasLocalLinkage())
+ O << "\t.local\t" << name << '\n';
+ }
+ O << TAI->getCOMMDirective() << name << ',' << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << ',' << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ if (VerboseAsm) {
+ O << "\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ return;
+ }
+ }
+
+ switch (GVar->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl " << name << '\n'
+ << TAI->getWeakDefDirective() << name << '\n';
+ } else if (Subtarget->isTargetCygMing()) {
+ O << "\t.globl\t" << name << "\n"
+ "\t.linkonce same_size\n";
+ } else {
+ O << "\t.weak\t" << name << '\n';
+ }
+ break;
+ case GlobalValue::DLLExportLinkage:
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << '\n';
+ // FALL THROUGH
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, GVar);
+ O << name << ":";
+ if (VerboseAsm){
+ O << "\t\t\t\t" << TAI->getCommentString() << ' ';
+ PrintUnmangledNameSafely(GVar, O);
+ }
+ O << '\n';
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size\t" << name << ", " << Size << '\n';
+
+ EmitGlobalConstant(C);
+}
+
+/// printGVStub - Print stub for a global value.
+///
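+/// Emits a non-lazy symbol pointer entry of the form:
+///   _foo$non_lazy_ptr:
+///           .indirect_symbol _foo
+///           .long 0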
+void X86ATTAsmPrinter::printGVStub(const char *GV, const char *Prefix) {
+ printSuffixedName(GV, "$non_lazy_ptr", Prefix);
+ O << ":\n\t.indirect_symbol ";
+ if (Prefix) O << Prefix;
+ O << GV << "\n\t.long\t0\n";
+}
+
+/// printHiddenGVStub - Print stub for a hidden global value.
+///
+void X86ATTAsmPrinter::printHiddenGVStub(const char *GV, const char *Prefix) {
+ EmitAlignment(2);
+ printSuffixedName(GV, "$non_lazy_ptr", Prefix);
+ if (Prefix) O << Prefix;
+ O << ":\n" << TAI->getData32bitsDirective() << GV << '\n';
+}
+
+
+bool X86ATTAsmPrinter::doFinalization(Module &M) {
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ printModuleLevelGV(I);
+
+ if (I->hasDLLExportLinkage())
+ DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),""));
+
+ // If the global is a extern weak symbol, remember to emit the weak
+ // reference!
+ // FIXME: This is rather hacky, since we'll emit references to ALL weak
+ // stuff, not used. But currently it's the only way to deal with extern weak
+ // initializers hidden deep inside constant expressions.
+ if (I->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(I);
+ }
+
+ for (Module::const_iterator I = M.begin(), E = M.end();
+ I != E; ++I) {
+ // If the global is a extern weak symbol, remember to emit the weak
+ // reference!
+ // FIXME: This is rather hacky, since we'll emit references to ALL weak
+ // stuff, not used. But currently it's the only way to deal with extern weak
+ // initializers hidden deep inside constant expressions.
+ if (I->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(I);
+ }
+
+ // Output linker support code for dllexported globals
+ if (!DLLExportedGVs.empty())
+ SwitchToDataSection(".section .drectve");
+
+ for (StringSet<>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i)
+ O << "\t.ascii \" -export:" << i->getKeyData() << ",data\"\n";
+
+ if (!DLLExportedFns.empty()) {
+ SwitchToDataSection(".section .drectve");
+ }
+
+ for (StringSet<>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i)
+ O << "\t.ascii \" -export:" << i->getKeyData() << "\"\n";
+
+ if (Subtarget->isTargetDarwin()) {
+ SwitchToDataSection("");
+
+ // Output stubs for dynamically-linked functions
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ SwitchToDataSection("\t.section __IMPORT,__jump_table,symbol_stubs,"
+ "self_modifying_code+pure_instructions,5", 0);
+ const char *p = i->getKeyData();
+ printSuffixedName(p, "$stub");
+ O << ":\n"
+ "\t.indirect_symbol " << p << "\n"
+ "\thlt ; hlt ; hlt ; hlt ; hlt\n";
+ }
+
+ O << '\n';
+
+ // Print global value stubs.
+ bool InStubSection = false;
+ if (TAI->doesSupportExceptionHandling() && MMI && !Subtarget->is64Bit()) {
+ // Add the (possibly multiple) personalities to the set of global values.
+ // Only referenced functions get into the Personalities list.
+ const std::vector<Function *>& Personalities = MMI->getPersonalities();
+ for (std::vector<Function *>::const_iterator I = Personalities.begin(),
+ E = Personalities.end(); I != E; ++I) {
+ if (!*I)
+ continue;
+ if (!InStubSection) {
+ SwitchToDataSection(
+ "\t.section __IMPORT,__pointers,non_lazy_symbol_pointers");
+ InStubSection = true;
+ }
+ printGVStub((*I)->getNameStart(), "_");
+ }
+ }
+
+ // Output stubs for external and common global variables.
+ if (!InStubSection && !GVStubs.empty())
+ SwitchToDataSection(
+ "\t.section __IMPORT,__pointers,non_lazy_symbol_pointers");
+ for (StringSet<>::iterator i = GVStubs.begin(), e = GVStubs.end();
+ i != e; ++i)
+ printGVStub(i->getKeyData());
+
+ if (!HiddenGVStubs.empty()) {
+ SwitchToSection(TAI->getDataSection());
+ for (StringSet<>::iterator i = HiddenGVStubs.begin(), e = HiddenGVStubs.end();
+ i != e; ++i)
+ printHiddenGVStub(i->getKeyData());
+ }
+
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+ } else if (Subtarget->isTargetCygMing()) {
+ // Emit type information for external functions
+ for (StringSet<>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ O << "\t.def\t " << i->getKeyData()
+ << ";\t.scl\t" << COFF::C_EXT
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+ } else if (Subtarget->isTargetELF()) {
+ // Emit final debug information.
+ DwarfWriter *DW = getAnalysisIfAvailable<DwarfWriter>();
+ DW->EndModule();
+ }
+
+ return AsmPrinter::doFinalization(M);
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter.inc"
diff --git a/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h
new file mode 100644
index 0000000..5b40e73
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTAsmPrinter.h
@@ -0,0 +1,164 @@
+//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ATTASMPRINTER_H
+#define X86ATTASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineJumpTableInfo;
+
+class VISIBILITY_HIDDEN X86ATTAsmPrinter : public AsmPrinter {
+ DwarfWriter *DW;
+ MachineModuleInfo *MMI;
+ const X86Subtarget *Subtarget;
+ public:
+ explicit X86ATTAsmPrinter(raw_ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V), DW(0), MMI(0) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 AT&T-Style Assembly Printer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ if (Subtarget->isTargetDarwin() ||
+ Subtarget->isTargetELF() ||
+ Subtarget->isTargetCygMing()) {
+ AU.addRequired<MachineModuleInfo>();
+ }
+ AU.addRequired<DwarfWriter>();
+ AsmPrinter::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0, bool NotRIPRel = false);
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ printLeaMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL, bool NotRIPRel = false);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL, bool NotRIPRel = false);
+ void printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const;
+ void printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ AsmPrinter::printPICJumpTableSetLabel(uid, uid2, MBB);
+ }
+ void printPICJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid) const;
+
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ void printModuleLevelGV(const GlobalVariable* GVar);
+
+ void printGVStub(const char *GV, const char *Prefix = NULL);
+ void printHiddenGVStub(const char *GV, const char *Prefix = NULL);
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void emitFunctionHeader(const MachineFunction &MF);
+
+ // Necessary for Darwin to print out the appropriate types of linker stubs
+ StringSet<> FnStubs, GVStubs, HiddenGVStubs;
+
+ // Necessary for dllexport support
+ StringSet<> DLLExportedFns, DLLExportedGVs;
+
+ // We have to propagate some information about a MachineFunction to the
+ // AsmPrinter. That is easy while we're printing the function itself, since
+ // we have access to the MachineFunction and can get the appropriate
+ // MachineFunctionInfo. Unfortunately, it is not possible when we're printing
+ // a reference to a Function (e.g. when calling it), because there is no way
+ // to get the corresponding MachineFunction: it may not even have been
+ // created yet. That's why we collect all the necessary information in an
+ // additional structure.
+ //
+ // This structure is used e.g. for name decoration of stdcall and fastcall
+ // functions, since the decoration depends on the total size of the arguments.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+ void decorateName(std::string& Name, const GlobalValue* GV);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
new file mode 100644
index 0000000..c874849
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -0,0 +1,50 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel and AT&T
+// format assembly language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTAsmPrinter.h"
+#include "X86IntelAsmPrinter.h"
+#include "X86Subtarget.h"
+using namespace llvm;
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
+/// for a MachineFunction to the given output stream, using the given target
+/// machine description.
+///
+FunctionPass *llvm::createX86CodePrinterPass(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose) {
+ const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+
+ if (Subtarget->isFlavorIntel()) {
+ return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ } else {
+ return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo(),
+ OptLevel, verbose);
+ }
+}
+
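+// Register the AT&T/Intel printer factory with X86TargetMachine at static
+// construction time; X86AsmPrinterForceLink gives clients a symbol to
+// reference so this object file gets pulled in by the linker.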
+namespace {
+ static struct Register {
+ Register() {
+ X86TargetMachine::registerAsmPrinter(createX86CodePrinterPass);
+ }
+ } Registrator;
+}
+
+extern "C" int X86AsmPrinterForceLink;
+int X86AsmPrinterForceLink = 0;
diff --git a/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp
new file mode 100644
index 0000000..6599349
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.cpp
@@ -0,0 +1,609 @@
+//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Intel format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelAsmPrinter.h"
+#include "X86InstrInfo.h"
+#include "X86TargetAsmInfo.h"
+#include "X86.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData *TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ unsigned argNum = 1;
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI, ++argNum) {
+ const Type* Ty = AI->getType();
+
+ // 'Dereference' type in case of byval parameter attribute
+ if (F->paramHasAttr(argNum, Attribute::ByVal))
+ Ty = cast<PointerType>(Ty)->getElementType();
+
+ // Size should be aligned to DWORD boundary
+ Size += ((TD->getTypeAllocSize(Ty) + 3)/4)*4;
+ }
+
+ // We're not supporting tooooo huge arguments :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
+
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
+void X86IntelAsmPrinter::decorateName(std::string &Name,
+ const GlobalValue *GV) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ unsigned CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+ const X86MachineFunctionInfo *Info;
+ if (info_item == FunctionInfoMap.end()) {
+ // Calculate the appropriate function info and populate the map
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ switch (Info->getDecorationStyle()) {
+ case None:
+ break;
+ case StdCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+ break;
+ case FastCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+ if (Name[0] == '_')
+ Name[0] = '@';
+ else
+ Name = '@' + Name;
+
+ break;
+ default:
+ assert(0 && "Unsupported DecorationStyle");
+ }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ unsigned CC = F->getCallingConv();
+
+ // Populate the function information map. We don't want to populate
+ // non-stdcall or non-fastcall functions' information right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ decorateName(CurrentFnName, F);
+
+ SwitchToTextSection("_text", F);
+
+ unsigned FnAlign = 4;
+ if (F->hasFnAttr(Attribute::OptimizeForSize))
+ FnAlign = 1;
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unsupported linkage type!");
+ case Function::PrivateLinkage:
+ case Function::InternalLinkage:
+ EmitAlignment(FnAlign);
+ break;
+ case Function::DLLExportLinkage:
+ DLLExportedFns.insert(CurrentFnName);
+ //FALLS THROUGH
+ case Function::ExternalLinkage:
+ O << "\tpublic " << CurrentFnName << "\n";
+ EmitAlignment(FnAlign);
+ break;
+ }
+
+ O << CurrentFnName << "\tproc near\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block if there are any predecessors.
+ if (!I->pred_empty()) {
+ printBasicBlockLabel(I, true, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ printMachineInstruction(II);
+ }
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O << CurrentFnName << "\tendp\n";
+
+ O.flush();
+
+ // We didn't modify anything.
+ return false;
+}
+
+void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
+ const char *Modifier) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT VT = (strcmp(Modifier,"subreg64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << TRI->getName(Reg);
+ } else
+ O << "reg" << MO.getReg();
+ return;
+ }
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMBB());
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << "_" << MO.getIndex();
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << "[" << TAI->getPrivateGlobalPrefix() << "CPI"
+ << getFunctionNumber() << "_" << MO.getIndex();
+ printOffset(MO.getOffset());
+ O << "]";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+
+ decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp) O << "OFFSET ";
+ if (GV->hasDLLImportLinkage()) {
+ // FIXME: This should be fixed with full support of stdcall & fastcall
+ // CC's
+ O << "__imp_";
+ }
+ O << Name;
+ printOffset(MO.getOffset());
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ if (!isCallOp) O << "OFFSET ";
+ O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
+
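+// printLeaMemReference - Print an address in Intel syntax, i.e.
+// [base + scale*index + disp], for example [esp + 4*eax + 16].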
+void X86IntelAsmPrinter::printLeaMemReference(const MachineInstr *MI,
+ unsigned Op,
+ const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ int ScaleVal = MI->getOperand(Op+1).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ O << "[";
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOp(BaseReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << "*";
+ printOp(IndexReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (DispSpec.isGlobal() || DispSpec.isCPI() ||
+ DispSpec.isJTI()) {
+ if (NeedPlus)
+ O << " + ";
+ printOp(DispSpec, "mem");
+ } else {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+ O << "]";
+}
+
+void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ MachineOperand Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, Modifier);
+}
+
+void X86IntelAsmPrinter::printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const {
+ if (!TAI->getSetDirective())
+ return;
+
+ O << TAI->getSetDirective() << ' ' << TAI->getPrivateGlobalPrefix()
+ << getFunctionNumber() << '_' << uid << "_set_" << MBB->getNumber() << ',';
+ printBasicBlockLabel(MBB, false, false, false);
+ O << '-' << "\"L" << getFunctionNumber() << "$pb\"'\n";
+}
+
+void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ O << "\"L" << getFunctionNumber() << "$pb\"\n";
+ O << "\"L" << getFunctionNumber() << "$pb\":";
+}
+
+bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ }
+
+ O << '%' << TRI->getName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in Intel syntax to the current output stream.
+///
+void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+bool X86IntelAsmPrinter::doInitialization(Module &M) {
+ bool Result = AsmPrinter::doInitialization(M);
+
+ Mang->markCharUnacceptable('.');
+
+ O << "\t.686\n\t.model flat\n\n";
+
+ // Emit declarations for external functions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+ decorateName(Name, I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":near\n";
+ }
+
+ // Emit declarations for external globals. Note that VC++ always declares
+ // external globals to have type byte, and if that's good enough for VC++...
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":byte\n";
+ }
+ }
+
+ return Result;
+}
+
+bool X86IntelAsmPrinter::doFinalization(Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+    if (I->isDeclaration()) continue;   // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(I))
+ continue;
+
+ std::string name = Mang->getValueName(I);
+ Constant *C = I->getInitializer();
+ unsigned Align = TD->getPreferredAlignmentLog(I);
+ bool bCustomSegment = false;
+
+ switch (I->getLinkage()) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment common 'COMMON'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::AppendingLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment public 'DATA'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::DLLExportLinkage:
+ DLLExportedGVs.insert(name);
+ // FALL THROUGH
+ case GlobalValue::ExternalLinkage:
+ O << "\tpublic " << name << "\n";
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ SwitchToSection(TAI->getDataSection());
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ if (!bCustomSegment)
+ EmitAlignment(Align, I);
+
+ O << name << ":";
+ if (VerboseAsm)
+ O << "\t\t\t\t" << TAI->getCommentString()
+ << " " << I->getName();
+ O << '\n';
+
+ EmitGlobalConstant(C);
+
+ if (bCustomSegment)
+ O << name << "?\tends\n";
+ }
+
+ // Output linker support code for dllexported globals
+ if (!DLLExportedGVs.empty() || !DLLExportedFns.empty()) {
+ SwitchToDataSection("");
+    O << "; WARNING: The following code is valid only with MASM v8.x "
+      << "and (possibly) higher\n"
+      << "; This version of MASM is usually shipped with Microsoft "
+      << "Visual Studio 2005\n"
+      << "; or (possibly) later versions. Unfortunately, there is no "
+      << "way to support\n"
+      << "; dllexported symbols in earlier versions of MASM in a fully "
+      << "automatic way\n\n";
+ O << "_drectve\t segment info alias('.drectve')\n";
+ }
+
+ for (StringSet<>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i)
+ O << "\t db ' /EXPORT:" << i->getKeyData() << ",data'\n";
+
+ for (StringSet<>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i)
+ O << "\t db ' /EXPORT:" << i->getKeyData() << "'\n";
+
+ if (!DLLExportedGVs.empty() || !DLLExportedFns.empty())
+ O << "_drectve\t ends\n";
+
+ // Bypass X86SharedAsmPrinter::doFinalization().
+ bool Result = AsmPrinter::doFinalization(M);
+ SwitchToDataSection("");
+ O << "\tend\n";
+ return Result;
+}
+
+void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const {
+ unsigned NumElts = CVA->getNumOperands();
+ if (NumElts) {
+ // ML does not have escape sequences except '' for '. It also has a maximum
+ // string length of 255.
+ unsigned len = 0;
+ bool inString = false;
+ for (unsigned i = 0; i < NumElts; i++) {
+ int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255;
+ if (len == 0)
+ O << "\tdb ";
+
+ if (n >= 32 && n <= 127) {
+ if (!inString) {
+ if (len > 0) {
+ O << ",'";
+ len += 2;
+ } else {
+ O << "'";
+ len++;
+ }
+ inString = true;
+ }
+ if (n == '\'') {
+ O << "'";
+ len++;
+ }
+ O << char(n);
+ } else {
+ if (inString) {
+ O << "'";
+ len++;
+ inString = false;
+ }
+ if (len > 0) {
+ O << ",";
+ len++;
+ }
+ O << n;
+ len += 1 + (n > 9) + (n > 99);
+ }
+
+ if (len > 60) {
+ if (inString) {
+ O << "'";
+ inString = false;
+ }
+ O << "\n";
+ len = 0;
+ }
+ }
+
+ if (len > 0) {
+ if (inString)
+ O << "'";
+ O << "\n";
+ }
+ }
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter1.inc"
diff --git a/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h
new file mode 100644
index 0000000..9520d98
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelAsmPrinter.h
@@ -0,0 +1,152 @@
+//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Intel assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INTELASMPRINTER_H
+#define X86INTELASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+struct VISIBILITY_HIDDEN X86IntelAsmPrinter : public AsmPrinter {
+ explicit X86IntelAsmPrinter(raw_ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T, CodeGenOpt::Level OL,
+ bool V)
+ : AsmPrinter(O, TM, T, OL, V) {}
+
+ virtual const char *getPassName() const {
+ return "X86 Intel-Style Assembly Printer";
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // This method is used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isReg()) {
+ assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Not physreg??");
+ O << TM.getRegisterInfo()->get(MO.getReg()).Name; // Capitalized names
+ } else {
+ printOp(MO, Modifier);
+ }
+ }
+
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO, const char *Modifier = 0);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printPICJumpTableSetLabel(unsigned uid,
+ const MachineBasicBlock *MBB) const;
+ void printPICJumpTableSetLabel(unsigned uid, unsigned uid2,
+ const MachineBasicBlock *MBB) const {
+ AsmPrinter::printPICJumpTableSetLabel(uid, uid2, MBB);
+ }
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+  // We have to propagate some information about the MachineFunction to the
+  // AsmPrinter. That is fine while we are printing the function itself, since
+  // we have access to the MachineFunction and can get the appropriate
+  // MachineFunctionInfo. Unfortunately, this is not possible when we are
+  // printing a reference to a Function (e.g. calling it and so on). Worse,
+  // there is no way to get the corresponding MachineFunction: it may not even
+  // have been created yet. That is why we collect all the necessary
+  // information in an additional structure.
+  //
+  // This structure is used e.g. for name decoration of stdcall & fastcall'ed
+  // functions, since we have to use the arguments' size for the decoration.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+ void decorateName(std::string& Name, const GlobalValue* GV);
+
+ virtual void EmitString(const ConstantArray *CVA) const;
+
+ // Necessary for dllexport support
+ StringSet<> DLLExportedFns, DLLExportedGVs;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
new file mode 100644
index 0000000..d982990
--- /dev/null
+++ b/lib/Target/X86/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(LLVM_TARGET_DEFINITIONS X86.td)
+
+tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(X86GenRegisterNames.inc -gen-register-enums)
+tablegen(X86GenRegisterInfo.inc -gen-register-desc)
+tablegen(X86GenInstrNames.inc -gen-instr-enums)
+tablegen(X86GenInstrInfo.inc -gen-instr-desc)
+tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(X86GenDAGISel.inc -gen-dag-isel)
+tablegen(X86GenFastISel.inc -gen-fast-isel)
+tablegen(X86GenCallingConv.inc -gen-callingconv)
+tablegen(X86GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(X86CodeGen
+ X86CodeEmitter.cpp
+ X86ELFWriterInfo.cpp
+ X86FloatingPoint.cpp
+ X86FloatingPointRegKill.cpp
+ X86ISelDAGToDAG.cpp
+ X86ISelLowering.cpp
+ X86InstrInfo.cpp
+ X86JITInfo.cpp
+ X86RegisterInfo.cpp
+ X86Subtarget.cpp
+ X86TargetAsmInfo.cpp
+ X86TargetMachine.cpp
+ X86FastISel.cpp
+ )
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..44f1c5d
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86CodeGen
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+ X86GenRegisterInfo.inc X86GenInstrNames.inc \
+ X86GenInstrInfo.inc X86GenAsmWriter.inc \
+ X86GenAsmWriter1.inc X86GenDAGISel.inc \
+ X86GenFastISel.inc \
+ X86GenCallingConv.inc X86GenSubtarget.inc
+
+DIRS = AsmPrinter
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..be28e8b
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,85 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now, but the following two have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (extloadf64f32 addr:$src2)))]>;
+ // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (X86fild addr:$src2, i32)))]>;
+ // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global. Also, it should handle simple permutations
+to reduce the number of shuffle instructions, e.g. turning:
+
+fld P -> fld Q
+fld Q fld P
+fxch
+
+or:
+
+fxch -> fucomi
+fucomi jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint,floor,ceil,trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Opencode the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This just requires being smarter when custom expanding fptoui.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..a6c8616
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,71 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+ return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+ subl $28, %esp
+ movl 32(%esp), %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl %eax, 20(%esp)
+ movq %mm0, 8(%esp)
+ movl 12(%esp), %eax
+ movl %eax, 16(%esp)
+ movq 16(%esp), %mm0
+ addl $28, %esp
+ ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+ subl $12, %esp
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl $0, (%eax)
+ movl %edx, 4(%eax)
+ addl $12, %esp
+ ret $4
+
+//===---------------------------------------------------------------------===//
+
+We generate crappy code for this:
+
+__m64 t() {
+ return _mm_cvtsi32_si64(1);
+}
+
+_t:
+ subl $12, %esp
+ movl $1, %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl 4(%esp), %edx
+ addl $12, %esp
+ ret
+
+The extra stack traffic is covered in the previous entry. But the other reason
+is that we are not smart about materializing constants in MMX registers.
+With -m64 we generate:
+
+ movl $1, %eax
+ movd %eax, %mm0
+ movd %mm0, %rax
+ ret
+
+We should be using a constantpool load instead:
+ movq LC0(%rip), %rax
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..71ad51c
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,918 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+ unaligned LLVM load instructions.
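+
+As a rough sketch of what this means at the source level (my example, not code
+from this tree): the intrinsic form below and the plain unaligned load express
+the same operation, but the second one is just an ordinary (unaligned) load in
+the IR, so generic optimizations can see through it.
+
+#include <xmmintrin.h>
+#include <cstring>
+
+// Intrinsic form: mostly opaque to IR-level analyses.
+__m128 load_unaligned_intrinsic(const float *p) {
+  return _mm_loadu_ps(p);
+}
+
+// Equivalent plain load: the memcpy becomes an unaligned vector load in IR,
+// which the optimizer can reason about like any other load.
+__m128 load_unaligned_plain(const float *p) {
+  __m128 v;
+  std::memcpy(&v, p, sizeof(v));
+  return v;
+}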
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs on x86-32.
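+
+As a rough illustration of what this could buy (my sketch, not existing backend
+code): SSE2 already has a 64-bit integer add, so a pair of i64 additions fits
+in one XMM op even on x86-32, where GPR i64 math needs add/adc chains.
+
+#include <emmintrin.h>
+
+// Two 64-bit additions in a single paddq; no add/adc carry chain needed.
+__m128i add_two_i64(__m128i a, __m128i b) {
+  return _mm_add_epi64(a, b);
+}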
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+ %C = select bool %B, double 123.412, double 523.01123123
+ ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
+like this:
+
+ X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+..and this uses one fewer register (so this should be done at load folding
+time, not at spiller time). *Note* however that this can only be done
+if Y is dead. Here's a testcase:
+
+@.str_3 = external global [15 x i8]
+declare void @printf(i32, ...)
+define void @main() {
+build_tree.exit:
+ br label %no_exit.i7
+
+no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
+ %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.34.i18, %no_exit.i7 ]
+ %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.28.i16, %no_exit.i7 ]
+ %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+ %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+ br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
+
+Compute_Tree.exit23: ; preds = %no_exit.i7
+ tail call void (i32, ...)* @printf( i32 0 )
+ store double %tmp.34.i18, double* null
+ ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+ xorpd %XMM1, %XMM1
+ addsd %XMM0, %XMM1
+*** movsd %XMM2, QWORD PTR [%ESP + 8]
+*** addsd %XMM2, %XMM1
+*** movsd QWORD PTR [%ESP + 8], %XMM2
+ jmp .BBmain_1 # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why it doesn't make much sense
+(e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+ %C = setlt double %A, %B
+ %z = add double %Z, 0.0 ;; select operand is not a load
+ %D = select bool %C, double %Y, double %z
+ ret double %D
+}
+
+We currently emit:
+
+_X:
+ subl $12, %esp
+ xorpd %xmm0, %xmm0
+ addsd 24(%esp), %xmm0
+ movsd 32(%esp), %xmm1
+ movsd 16(%esp), %xmm2
+ ucomisd 40(%esp), %xmm1
+ jb LBB_X_2
+LBB_X_1:
+ movsd %xmm0, %xmm2
+LBB_X_2:
+ movsd %xmm2, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+ if (copysign(1.0, x) == copysign(1.0, y))
+into:
+ if (x^y & mask)
+when using SSE.
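+
+For reference, here is a scalar sketch of that transformation (my example,
+assuming IEEE-754 doubles; the helper name is made up): comparing the two sign
+bits directly replaces the pair of copysign calls.
+
+#include <cstdint>
+#include <cstring>
+
+// true iff x and y have the same sign bit; equivalent to
+// copysign(1.0, x) == copysign(1.0, y) for all inputs, including NaNs.
+bool same_sign(double x, double y) {
+  uint64_t xi, yi;
+  std::memcpy(&xi, &x, sizeof(xi));  // bit-cast without aliasing trouble
+  std::memcpy(&yi, &y, sizeof(yi));
+  return ((xi ^ yi) & 0x8000000000000000ULL) == 0;
+}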
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+ movaps LCPI5_5, %xmm2
+ divps %xmm1, %xmm2
+ mulps %xmm2, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm2
+ andps LCPI5_1, %xmm3
+ por %xmm2, %xmm3
+ movdqa %xmm3, (%edi)
+
+ movaps LCPI5_5, %xmm1
+ divps %xmm0, %xmm1
+ mulps %xmm1, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm1
+ andps LCPI5_1, %xmm3
+ orps %xmm1, %xmm3
+ movaps %xmm3, 112(%esp)
+ movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants, which are likely to be
+shorter?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+ movaps (%edx), %xmm2 #59.21
+ movaps (%edx), %xmm5 #60.21
+ movaps (%edx), %xmm4 #61.21
+ movaps (%edx), %xmm3 #62.21
+ movl 40(%ecx), %ebp #69.49
+ shufps $0, %xmm2, %xmm5 #60.21
+ movl 100(%esp), %ebx #69.20
+ movl (%ebx), %edi #69.20
+ imull %ebp, %edi #69.49
+ addl (%eax), %edi #70.33
+ shufps $85, %xmm2, %xmm4 #61.21
+ shufps $170, %xmm2, %xmm3 #62.21
+ shufps $255, %xmm2, %xmm2 #63.21
+ lea (%ebp,%ebp,2), %ebx #69.49
+ negl %ebx #69.49
+ lea -3(%edi,%ebx), %ebx #70.33
+ shll $4, %ebx #68.37
+ addl 32(%ecx), %ebx #68.37
+ testb $15, %bl #91.13
+ jne L_B1.24 # Prob 5% #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %reg1078 = MOV32ri -3
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+ %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+ %reg1080 = IMUL32rr %reg1079, %reg1037
+ %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+ %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+ %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+ %reg1082 = SHL32ri %reg1038, 4
+ %reg1039 = ADD32rr %reg1036, %reg1082
+ %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+ %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+ %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+ %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+ %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+ %reg1040 = MOV32rr %reg1039
+ %reg1084 = AND32ri8 %reg1039, 15
+ CMP32ri8 %reg1084, 0
+ JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %EAX = MOV32ri -3
+ %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+ ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+ %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+ %EDX = MOV32rm %EDX, 1, %NOREG, 40
+ IMUL32rr %EAX<def&use>, %EDX
+ %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 0
+ MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+ %EAX = LEA32r %ESI, 1, %EAX, -3
+ %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 32
+ %EDI = MOV32rr %EAX
+ SHL32ri %EDI<def&use>, 4
+ ADD32rr %EDI<def&use>, %ESI
+ %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+ %XMM1 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM1<def&use>, %XMM1, 170
+ %XMM2 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM2<def&use>, %XMM2, 0
+ %XMM3 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM3<def&use>, %XMM3, 255
+ SHUFPSrr %XMM0<def&use>, %XMM0, 85
+ %EBX = MOV32rr %EDI
+ AND32ri8 %EBX<def&use>, 15
+ CMP32ri8 %EBX, 0
+ JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode: since
+it appears as operand two in more than one shufps op, it resulted in a number of
+copies. Note that icc suffers from the same problem. Either the instruction
+selector should select pshufd, or the register allocator should make the
+two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4: # cond_true44
+ addps %xmm1, %xmm2
+ subps %xmm3, %xmm2
+ movaps (%ecx), %xmm4
+ movaps %xmm2, %xmm1
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+There are two problems. 1) There is no need for two loop induction variables; we
+can compare against 262144 * 16. 2) A known register coalescer issue. We should
+be able to eliminate one of the movaps:
+
+ addps %xmm2, %xmm1 <=== Commute!
+ subps %xmm3, %xmm1
+ movaps (%ecx), %xmm4
+ movaps %xmm1, %xmm1 <=== Eliminate!
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+ return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of
+xmm1 are already zero'd. We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea. Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+ ...
+ return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled. This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros plus the one element instead of all 4 elements.
+This can be used to simplify a variety of shuffle operations where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+ return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+define void @test(float* %P, <4 x float>* %P2 ) {
+ %xFloat0.688 = load float* %P
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
+ store <4 x float> %inFloat3.713, <4 x float>* %P2
+ ret void
+}
+
+Generates:
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+Would it be better to generate:
+
+_test:
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ xor %eax, %eax
+ pinsrw $6, %eax, %xmm0
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+?
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-path is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+
+and:
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+ psrlq xmm1, 31 ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+ a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+ a = _mm_slli_epi32 (a, n);
+}
+
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+ movzwl 4(%esp), %eax
+ movd %eax, %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+_y:
+ movd 4(%esp), %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+
+"y" looks good, but "x" does a silly movzwl shuffle through a GPR. It seems
+like movd would be sufficient in both cases, as the value is already zero
+extended in the 32-bit stack slot IIRC. For signed short, it should also be
+safe, since a genuinely negative value would be undefined for pslld anyway.
+
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 4(%esp), %eax
+ shrl $31, %eax
+ addl $12, %esp
+ ret
+
+We should use movmskp{s|d} instead.
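+
+A sketch of the desired lowering, written with the SSE2 intrinsic rather than
+as a DAG pattern (my example, not code from the backend):
+
+#include <emmintrin.h>
+
+// movmskpd extracts the sign bits of both lanes into an integer; bit 0 is the
+// sign of the low lane, so this is signbit(d) without the store/reload above.
+int signbit_via_movmsk(double d) {
+  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
+}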
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+ not the dag combiner. This is because dagcombine2 needs to be able to see
+ through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+ independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+ load from a global or a direct load from the stack. It should be generalized
+ to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+ mode because it doesn't look through the extra dyld stub load. If you try
+ vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
+entry:
+ %tmp6 = sub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
+ ret i64 %tmp20
+}
+
+This currently compiles to:
+
+LCPI1_0: # <4 x float>
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+_ccosf:
+ subl $12, %esp
+ movss 16(%esp), %xmm0
+ movss %xmm0, 4(%esp)
+ movss 20(%esp), %xmm0
+ xorps LCPI1_0, %xmm0
+ movss %xmm0, (%esp)
+ call L_ccoshf$stub
+ addl $12, %esp
+ ret
+
+Note the load into xmm0, then xor (to negate), then store. In PIC mode,
+this code computes the pic base and does two loads to do the constant pool
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into
+an X86 fxor. This means that we need to handle this case in the x86 backend
+instead of in target independent code.
+
+//===---------------------------------------------------------------------===//
+
+Non-SSE4 insert into 16 x i8 is atrociously bad.
+
+//===---------------------------------------------------------------------===//
+
+<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
+is memory.
+
+//===---------------------------------------------------------------------===//
+
+SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
+sitting between the truncate and the extract.
+
+//===---------------------------------------------------------------------===//
+
+INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
+any number of 0.0 simultaneously. Currently we only use it for simple
+insertions.
+
+See comments in LowerINSERT_VECTOR_ELT_SSE4.
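+
+For illustration, a single INSERTPS written with the SSE4.1 intrinsic (my
+example; if I recall the immediate encoding correctly, it is count_s in bits
+7:6, count_d in bits 5:4, and the zero mask in bits 3:0):
+
+#include <smmintrin.h>
+
+// Take lane 1 of b, insert it into lane 0 of a, and zero lanes 2 and 3,
+// all in one instruction: imm = 0x4C = 01 00 1100 in binary.
+__m128 insert_and_zero(__m128 a, __m128 b) {
+  return _mm_insert_ps(a, b, 0x4C);
+}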
+
+//===---------------------------------------------------------------------===//
+
+On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
+Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
+legal; it'll just take a few extra patterns written in the .td file.
+
+Note: this is not a code quality issue; the custom lowered code happens to be
+right, but we shouldn't have to custom lower anything. This is probably related
+to <2 x i64> ops being so bad.
+
+//===---------------------------------------------------------------------===//
+
+'select' on vectors and scalars could be a whole lot better. We currently
+lower them to conditional branches. On x86-64 for example, we compile this:
+
+double test(double a, double b, double c, double d) { return a<b ? c : d; }
+
+to:
+
+_test:
+ ucomisd %xmm0, %xmm1
+ ja LBB1_2 # entry
+LBB1_1: # entry
+ movapd %xmm3, %xmm2
+LBB1_2: # entry
+ movapd %xmm2, %xmm0
+ ret
+
+instead of:
+
+_test:
+ cmpltsd %xmm1, %xmm0
+ andpd %xmm0, %xmm2
+ andnpd %xmm3, %xmm0
+ orpd %xmm2, %xmm0
+ ret
+
+For unpredictable branches, the latter is much more efficient. This should
+just be a matter of having scalar SSE map to SELECT_CC and custom expanding
+or iseling it.
+
+//===---------------------------------------------------------------------===//
+
+LLVM currently generates stack realignment code when it is not actually
+needed. The problem is that we need to know about stack alignment too early,
+before RA runs.
+
+At that point we don't know whether there will be a vector spill or not.
+The stack realignment logic is overly conservative here, but otherwise we can
+produce unaligned loads/stores.
+
+Fixing this will require some huge RA changes.
+
+Testcase:
+#include <emmintrin.h>
+
+typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
+
+static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
+- 22725, - 12873};;
+
+vSInt16 madd(vSInt16 b)
+{
+ return _mm_madd_epi16(a, b);
+}
+
+Generated code (x86-32, linux):
+madd:
+ pushl %ebp
+ movl %esp, %ebp
+ andl $-16, %esp
+ movaps .LCPI1_0, %xmm1
+ pmaddwd %xmm1, %xmm0
+ movl %ebp, %esp
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+#include <emmintrin.h>
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+ movss 4(%esp), %xmm0
+ pshufd $81, %xmm0, %xmm0
+ ret
+
+in x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+ xorps %xmm1, %xmm1
+ movss %xmm0, %xmm1
+ pshufd $81, %xmm1, %xmm0
+ ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+ insertps $0x10, x2(%rip), %xmm0
+ insertps $0x10, x3(%rip), %xmm1
+ movaps %xmm1, %xmm2
+ movlhps %xmm0, %xmm2
+ movaps %xmm2, %xmm0
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind {
+ %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+ ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0: ## <4 x i32>
+ .long 10
+ .long 10
+ .long 10
+ .long 10
+ .text
+ .align 4,0x90
+ .globl _f
+_f:
+ pshufd $3, %xmm0, %xmm1
+ movd %xmm1, %eax
+ imull LCPI1_0+12, %eax
+ movd %eax, %xmm1
+ pshufd $1, %xmm0, %xmm2
+ movd %xmm2, %eax
+ imull LCPI1_0+4, %eax
+ movd %eax, %xmm2
+ punpckldq %xmm1, %xmm2
+ movd %xmm0, %eax
+ imull LCPI1_0, %eax
+ movd %eax, %xmm1
+ movhlps %xmm0, %xmm0
+ movd %xmm0, %eax
+ imull LCPI1_0+8, %eax
+ movd %eax, %xmm0
+ punpckldq %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ punpckldq %xmm2, %xmm0
+ ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
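+
+A sketch of the shift/add form for one concrete constant (my example): v * 10
+decomposes into (v << 3) + (v << 1), which maps onto pslld and paddd and is
+exact in wrapping 32-bit arithmetic.
+
+#include <emmintrin.h>
+
+// <4 x i32> multiply by 10 without pmulld or scalar imuls.
+__m128i mul10_epi32(__m128i v) {
+  __m128i x8 = _mm_slli_epi32(v, 3);  // v * 8
+  __m128i x2 = _mm_slli_epi32(v, 1);  // v * 2
+  return _mm_add_epi32(x8, x2);       // v * 10
+}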
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+ return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+ movl $1, %eax
+ xorps %xmm0, %xmm0
+ pinsrw $2, %eax, %xmm0
+ movzbl 4(%esp), %eax
+ pinsrw $3, %eax, %xmm0
+ movl $256, %eax
+ pinsrw $7, %eax, %xmm0
+ ret
+
+
+gcc-4.2:
+ subl $12, %esp
+ movzbl 16(%esp), %eax
+ movdqa LC0, %xmm0
+ pinsrw $3, %eax, %xmm0
+ addl $12, %esp
+ ret
+ .const
+ .align 4
+LC0:
+ .word 0
+ .word 0
+ .word 1
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+ .word 256
+
+With SSE4, it should be
+ movdqa .LC0(%rip), %xmm0
+ pinsrb $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Insertelement of a constant into a vector of constants should
+also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+ .align 4
+LCPI1_1: ## float
+ .long 1065353216 ## float 1
+ .const
+
+ .align 4
+LCPI1_0: ## <4 x float>
+ .space 4
+ .long 1065353216 ## float 1
+ .space 4
+ .long 1065353216 ## float 1
+ .text
+ .align 4,0x90
+ .globl _t
+_t:
+ xorps %xmm0, %xmm0
+ movhps LCPI1_0, %xmm0
+ movss LCPI1_1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $2, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+ return x;
+}
+
+compiles to (x86-32):
+
+define float @foo(i8 zeroext %x) nounwind {
+ %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
+ ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+ subl $4, %esp
+ movzbl 8(%esp), %eax
+ cvtsi2ss %eax, %xmm0
+ movss %xmm0, (%esp)
+ flds (%esp)
+ addl $4, %esp
+ ret
+
+We should be able to use:
+  cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in scimark where using fpstack is better; it is
+cheaper to do fld1 than to load from a constant pool, for example, so
+"load, add 1.0, store" is better done on the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-UNIMPLEMENTED.txt b/lib/Target/X86/README-UNIMPLEMENTED.txt
new file mode 100644
index 0000000..69dc8ee
--- /dev/null
+++ b/lib/Target/X86/README-UNIMPLEMENTED.txt
@@ -0,0 +1,14 @@
+//===---------------------------------------------------------------------===//
+// Testcases that crash the X86 backend because they aren't implemented
+//===---------------------------------------------------------------------===//
+
+These are cases we know the X86 backend doesn't handle. Patches are welcome
+and appreciated, because no one has signed up to implement these yet.
+Implementing these would allow elimination of the corresponding intrinsics,
+which would be great.
+
+1) vector shifts
+2) vector comparisons
+3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
+4) bitcasts from vectors to scalars: PR2804
+
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..ad12137
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,251 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+ xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+ jmp _xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
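+
+As a concrete instance of the trade-off (my sketch): multiplying by a small
+constant can be strength-reduced to shifts and adds, which typically lower to
+lea/shl/add with different latency, register pressure, and code size than a
+single imul.
+
+// x * 10 == x*8 + x*2; lowerable to two shifts and an add (or lea + add),
+// trading one imul for several cheaper instructions.
+unsigned long long mul10(unsigned long long x) {
+  return (x << 3) + (x << 1);
+}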
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+It seems like the jb branch has a high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+ memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+ movq _b@GOTPCREL(%rip), %rax
+ movzbq (%rax), %rax
+ movq %rax, %rcx
+ shlq $8, %rcx
+ orq %rax, %rcx
+ movq %rcx, %rax
+ shlq $16, %rax
+ orq %rcx, %rax
+ movq %rax, %rcx
+ shlq $32, %rcx
+ movq _X@GOTPCREL(%rip), %rdx
+ orq %rax, %rcx
+ movq %rcx, (%rdx)
+ ret
+
+gcc:
+ movq _b@GOTPCREL(%rip), %rax
+ movabsq $72340172838076673, %rdx
+ movzbq (%rax), %rax
+ imulq %rdx, %rax
+ movq _X@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The vararg function prologue can be further optimized. Currently all XMM
+registers are stored into the register save area. Most of these stores can be
+eliminated since the upper bound on the number of XMM registers used is passed
+in %al. gcc produces something like the following:
+
+ movzbl %al, %edx
+ leaq 0(,%rdx,4), %rax
+ leaq 4+L2(%rip), %rdx
+ leaq 239(%rsp), %rax
+ jmp *%rdx
+ movaps %xmm7, -15(%rax)
+ movaps %xmm6, -31(%rax)
+ movaps %xmm5, -47(%rax)
+ movaps %xmm4, -63(%rax)
+ movaps %xmm3, -79(%rax)
+ movaps %xmm2, -95(%rax)
+ movaps %xmm1, -111(%rax)
+ movaps %xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. It is hard to see this
+being significant, as it added 5 instructions (including an indirect branch) to
+avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all the stores. Or, in the case
+of a leaf function where we can determine that no XMM input parameter is needed,
+avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non-
+ POD structure or union type, or contains unaligned fields, it has class
+ MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS.
+3. Each field of an object is classified recursively so that always two fields
+ are considered. The resulting class is calculated according to the classes
+ of the fields in the eightbyte:
+ (a) If both classes are equal, this is the resulting class.
+ (b) If one of the classes is NO_CLASS, the resulting class is the other
+ class.
+ (c) If one of the classes is MEMORY, the result is the MEMORY class.
+ (d) If one of the classes is INTEGER, the result is the INTEGER.
+ (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+ class.
+ (f) Otherwise class SSE is used.
+4. Then a post merger cleanup is done:
+ (a) If one of the classes is MEMORY, the whole argument is passed in memory.
+   (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently the llvm frontend does not handle this correctly.
+
+Problem 1:
+ typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a gcc-compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+ typedef struct { int32_t i; float j; double d; } QuadWordS;
+The size of the first two fields == i64, so they will be combined and passed in
+an integer register, RDI. The third field is still passed in XMM0.
+
+Problem 3:
+ typedef struct { int64_t i; int8_t j; int64_t d; } S;
+ void test(S s)
+The size of this aggregate is greater than two i64, so it should be passed in
+memory. Currently llvm breaks this down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further: a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s):
+ void test(S s, int b)
+
+LLVM IR does not allow passing parameters by aggregate, therefore it must break
+the aggregate value (in problems 3 and 4) into a number of scalar values:
+ void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally, it would pass the 3
+values in integer registers. To force it to be passed in memory, the frontend
+should change the function signature to:
+ void %test(long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+And the callee would look something like this:
+ call void %test( undef, undef, undef, undef, undef, undef,
+ %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the parameter 'd' would be moved to the front of the parameter
+list so it will be passed in a register:
+ void %test(int %d,
+ long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddresses are accessed via RIP-relative
+addressing. Therefore, it is not possible to generate this:
+ movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support small model. So the above
+is selected as
+ leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we would need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference the AH, BH, CH, and DH registers in an
+instruction requiring a REX prefix. However, divb and mulb both produce results
+in AH. If isel emits a CopyFromReg, it gets turned into a movb, and that can be
+allocated to r8b - r15b.
+
+To get around this, isel emits a CopyFromReg from AX and then right-shifts it
+down by 8 and truncates it. It's not pretty but it works. We need some register
+allocation magic to make the hack go away (e.g. putting additional constraints
+on the result of the movb).
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen; however, this is suboptimal because
+it makes it quite awkward to implement the optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end set up the sret argument value
+as the return value of the function. The front-end could more easily
+emit uses of the returned struct value in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication only required by a C-based ABI.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..710bd03
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1899 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?). This is available on Atom processors.
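+
+A sketch of the pattern movbe would cover (my example, using the GCC/Clang
+bswap builtin): a byte-swapping load that today lowers to a mov plus a bswap.
+
+#include <cstdint>
+#include <cstring>
+
+// Load a big-endian 32-bit value; ideally a single "movbe (mem), reg".
+uint32_t load_be32(const unsigned char *p) {
+  uint32_t v;
+  std::memcpy(&v, p, sizeof(v));  // plain little-endian load
+  return __builtin_bswap32(v);    // byte swap; movbe fuses the load and swap
+}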
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening, as it could have a significant impact on other
+important cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+ return X/Y;
+}
+
+This can be done trivially with a custom legalizer. What about overflow
+though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+ sall %cl, %edx
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+ movzbl %dil, %eax
+ xorl $1, %eax
+ ret
+
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. Dynamic programming based approach when compile time is not an
+ issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+ Sequencing of Instructions".
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+ and other related papers.
+ http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
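+
+A purely illustrative example:
+
+short f(short a, short b) { return a + b; }
+
+can emit 16-bit arithmetic that writes only %ax; a later read of the full %eax
+then waits for the partial-register merge on some processors. Doing the math in
+32 bits and truncating only at stores would sidestep that.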
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor. They are slower on the P4 due to only updating some processor
+flags.
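+
+For reference, a quick size/flags comparison (standard encodings, 32-bit mode):
+
+	incl	%eax          # 1 byte, but does not update CF
+	addl	$1, %eax      # 3 bytes, updates all flags; preferred on P4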
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare. The
+pattern is written as (cmp reg, (load p)). Because the compare isn't
+commutative, it is not matched with the load on both sides. The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
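+
+A tiny example of the pattern (hypothetical, not taken from a test case):
+
+int f(int *p, int x) { return *p < x; }
+
+produces a compare with the load on the LHS, which the (cmp reg, (load p))
+patterns will not fold; the combiner could swap the operands and invert the
+condition (lt -> gt) so that the load ends up on the RHS.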
+
+//===---------------------------------------------------------------------===//
+
+How about intrinsics? An example is:
+ *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
+
+compiles to
+ pmuludq (%eax), %xmm0
+ movl 8(%esp), %eax
+ movdqa (%eax), %xmm1
+ pmulhuw %xmm0, %xmm1
+
+The transformation probably requires a X86 specific pass or a DAG combiner
+target specific hook.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setl %al
+ movzbl %al, %eax
+ ret
+
+on some processors (which ones?), it is more efficient to do this:
+
+_test:
+ movl 8(%esp), %ebx
+ xor %eax, %eax
+ cmpl %ebx, 4(%esp)
+ setl %al
+ ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important. e.g., for:
+
+void setbit(int *target, int bit) {
+ *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+ *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+ movl $16843009, 4(%edx)
+ movl $16843009, (%edx)
+ movw $257, 8(%edx)
+
+It might be better to generate
+
+ movl $16843009, %eax
+ movl %eax, 4(%edx)
+ movl %eax, (%edx)
+ movw %ax, 8(%edx)
+
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
+get this:
+
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
+}
+
+_test1:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ shrl $29, %ecx
+ addl %ecx, %eax
+ sarl $3, %eax
+ ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+ movl 4(%esp), %eax
+ cmpl $-1, %eax
+ leal 7(%eax), %ecx
+ cmovle %ecx, %eax
+ sarl $3, %eax
+ ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc.)
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
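+
+Sketch of the idea (assuming the usual IEEE-754 layout; not a committed design):
+
+double f(double x, double *y) { return copysign(x, *y); }
+
+only needs the sign bit of *y, so an integer load of the word holding that bit,
+masked and OR'd into the bits of x, avoids pulling *y into an FP/SSE register
+at all.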
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+ if (_mm_comige_ss(*A, *B))
+ return 3;
+ else
+ return 4;
+}
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %eax
+ movaps (%eax), %xmm1
+ comiss %xmm0, %xmm1
+ setae %al
+ movzbl %al, %ecx
+ movl $3, %eax
+ movl $4, %edx
+ cmpl $0, %ecx
+ cmove %edx, %eax
+ ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of the
+intrinsic call and the select. 2) The intrinsic is expected to produce an i32 value
+so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+ return a * 3;
+}
+
+We currently emit
+ imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
+//===---------------------------------------------------------------------===//
+
+__builtin_ffs codegen is messy.
+
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+llvm produces:
+ffs_:
+ movl 4(%esp), %ecx
+ bsfl %ecx, %eax
+ movl $32, %edx
+ cmove %edx, %eax
+ incl %eax
+ xorl %edx, %edx
+ testl %ecx, %ecx
+ cmove %edx, %eax
+ ret
+
+vs gcc:
+
+_ffs_:
+ movl $-1, %edx
+ bsfl 4(%esp), %eax
+ cmove %edx, %eax
+ addl $1, %eax
+ ret
+
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h, there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
+}
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
+
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of a double argument on an
+8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+ if (a == 4 || a == 6)
+ b++;
+ return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b". For example, instead of:
+
+int G;
+void f(int X, int Y) {
+ G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+ movl $14, %eax
+ movl $13, %ecx
+ movl 4(%esp), %edx
+ testl %edx, %edx
+ cmovl %eax, %ecx
+ movl %ecx, _G
+ ret
+
+it could be:
+_f:
+ movl 4(%esp), %eax
+ sarl $31, %eax
+ notl %eax
+ addl $14, %eax
+ movl %eax, _G
+ ret
+
+etc.
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
+
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+ call fastcc void %test1( )
+ call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+ ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+ subl $16, %esp
+ call _test5
+ addl $12, %esp
+ subl $16, %esp
+ movl $_test5, (%esp)
+ call _test6
+ addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses to mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
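+
+Roughly the code we would like (illustrative; the constant assumes little-endian
+byte order):
+
+_f:
+	movl	4(%esp), %eax
+	cmpw	$513, (%eax)          # 0x0201 == bytes {1, 2}
+	sete	%al
+	movzbl	%al, %eax
+	ret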
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x + y) < 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+
+add_zf:
+ addl (%rdi), %esi
+ movl %edx, %eax
+ cmovns %ecx, %eax
+ ret
+
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ testl %esi, %esi
+ cmovs %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
+
+both of which are inferior to GCC's:
+
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead (likewise dec). For example, on X86-64, compile:
+
+int foo(int A, int B) {
+ return A+1;
+}
+
+to:
+
+_foo:
+ leal 1(%edi), %eax
+ ret
+
+instead of:
+
+_foo:
+ incl %edi
+ movl %edi, %eax
+ ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: not grep {mov E.X, E.X}
+
+@G = external global i32 ; <i32*> [#uses=3]
+
+define i32 @test1(i32 %X, i32 %Y) {
+ %Z = add i32 %X, %Y ; <i32> [#uses=1]
+ volatile store i32 %Y, i32* @G
+ volatile store i32 %Z, i32* @G
+ ret i32 %X
+}
+
+define i32 @test2(i32 %X) {
+ %Z = add i32 %X, 1 ; <i32> [#uses=1]
+ volatile store i32 %Z, i32* @G
+ ret i32 %X
+}
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
+
+ _Z11no_overflowjj:
+ addl %edi, %esi
+ setae %al
+ ret
+
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
+on x86-64, not:
+
+__Z11no_overflowjj:
+ addl %edi, %esi
+ cmpl %edi, %esi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
+condition register is dead. xor reg reg is shorter than mov reg, #0.
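+
+For reference (standard encodings, shown only to make the size argument concrete):
+
+	xorl	%eax, %eax        # 2 bytes, clobbers EFLAGS
+	movl	$0, %eax          # 5 bytes, EFLAGS untouched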
+
+//===---------------------------------------------------------------------===//
+
+We aren't matching RMW instructions aggressively
+enough. Here's a reduced testcase (more in PR1160):
+
+define void @test(i32* %huge_ptr, i32* %target_ptr) {
+ %A = load i32* %huge_ptr ; <i32> [#uses=1]
+ %B = load i32* %target_ptr ; <i32> [#uses=1]
+ %C = or i32 %A, %B ; <i32> [#uses=1]
+ store i32 %C, i32* %target_ptr
+ ret void
+}
+
+$ llvm-as < t.ll | llc -march=x86-64
+
+_test:
+ movl (%rdi), %eax
+ orl (%rsi), %eax
+ movl %eax, (%rsi)
+ ret
+
+That should be something like:
+
+_test:
+ movl (%rdi), %eax
+ orl %eax, (%rsi)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments onto the top of the stack (their normal place for
+non-tail-call-optimized calls) if they source from the caller's arguments
+or from a virtual register (which may itself source from the caller's
+arguments). This is done to prevent overwriting parameters (see the
+example below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to (fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+ movl (%esi,%edi,4), %ebx
+ addl (%ecx,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB1_2 # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+ movl (%esi,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ addl %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+ movl (%ecx,%edi,4), %ebx
+ subl (%esi,%edi,4), %ebx
+ subl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB9_2 # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+Handling llvm.memory.barrier on pre-SSE2 CPUs
+
+should generate:
+lock ; mov %esp, %esp
+
+//===---------------------------------------------------------------------===//
+
+The generated code on x86 for checking for signed overflow on a multiply done
+the obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. The downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1]
+ ret i32 %tmp5
+}
+
+and
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ negl %eax
+ ret
+
+Several deficiencies here. First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ shll $31, %eax
+ sarl $31, %eax
+ ret
+
+This code should be at least as good as the code above. Once this is fixed, we
+can optimize this specific case even more to:
+
+ movl 8(%esp), %eax
+ xorl %ecx, %ecx
+ cmpl %eax, 4(%esp)
+ sbbl %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+There are a few possible improvements here:
+1. We should be able to eliminate the dead load into %ecx
+2. We could change the "movl 8(%esp), %eax" into
+ "movzwl 10(%esp), %eax"; this lets us change the cmpl
+ into a testl, which is shorter, and eliminate the shift.
+
+We could also in theory eliminate the branch by using a conditional
+for the address of the load, but that seems unlikely to be worthwhile
+in general.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+_foo:
+ cmpb $0, 16(%esp)
+ movl 12(%esp), %ecx
+ movl 8(%esp), %eax
+ movl 4(%esp), %edx
+ je LBB1_2 # bb7
+LBB1_1: # bb
+ addl %edx, %eax
+ ret
+LBB1_2: # bb7
+ movl %edx, %eax
+ subl %ecx, %eax
+ ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+inserted a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Legalize loses track of the fact that bools are always zero extended when in
+memory. This causes us to compile abort_gzip (from 164.gzip) from:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+into:
+
+_abort_gzip:
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ notb %al
+ testb $1, %al
+ jne LBB1_2 ## bb4.i
+LBB1_1: ## bb.i
+ ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as: x+~y (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegens to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
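+
+A small example of the kind of redundant test (hypothetical):
+
+int f(int a, int b) {
+  int t = a & b;
+  if (t == 0)        /* the andl already set ZF; the extra testl is redundant */
+    return 1;
+  return t;
+}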
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core 2, and "Generic".
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+ Core 2, and "Generic".
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..fd13b02
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,84 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPass;
+class MachineCodeEmitter;
+class JITCodeEmitter;
+class raw_ostream;
+
+/// createX86ISelDag - This pass converts a legalized DAG into an
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX87FPRegKillInserterPass - This function returns a pass which
+/// inserts FP_REG_KILL instructions where needed.
+///
+FunctionPass *createX87FPRegKillInserterPass();
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.
+///
+FunctionPass *createX86CodePrinterPass(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose);
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE);
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE);
+
+/// createEmitX86CodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// createX86MaxStackAlignmentCalculatorPass - This function returns a pass
+/// which calculates the maximal stack alignment required for the function
+///
+FunctionPass *createX86MaxStackAlignmentCalculatorPass();
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..8df138d
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,184 @@
+//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+ "Enable MMX instructions">;
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ [FeatureMMX]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions">;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions">;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"i386", []>;
+def : Proc<"i486", []>;
+def : Proc<"i586", []>;
+def : Proc<"pentium", []>;
+def : Proc<"pentium-mmx", [FeatureMMX]>;
+def : Proc<"i686", []>;
+def : Proc<"pentiumpro", []>;
+def : Proc<"pentium2", [FeatureMMX]>;
+def : Proc<"pentium3", [FeatureSSE1]>;
+def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSSE2]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
+
+def : Proc<"k6", [FeatureMMX]>;
+def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+
+def : Proc<"winchip-c6", [FeatureMMX]>;
+def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+ // Define how we want to lay out our TargetSpecific information field... This
+ // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+ let TSFlagsFields = ["FormBits",
+ "hasOpSizePrefix",
+ "hasAdSizePrefix",
+ "Prefix",
+ "hasREX_WPrefix",
+ "ImmTypeBits",
+ "FPFormBits",
+ "hasLockPrefix",
+ "SegOvrBits",
+ "Opcode"];
+ let TSFlagsShifts = [0,
+ 6,
+ 7,
+ 8,
+ 12,
+ 13,
+ 16,
+ 19,
+ 20,
+ 24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} flag.
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTAsmPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelAsmPrinter";
+ int Variant = 1;
+}
+
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..0a8e4e6
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from COFF documentation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+ C_EFCN = -1, ///< Physical end of function
+ C_NULL = 0, ///< No symbol
+ C_AUTO = 1, ///< Automatic variable
+ C_EXT = 2, ///< External symbol
+ C_STAT = 3, ///< Static
+ C_REG = 4, ///< Register variable
+ C_EXTDEF = 5, ///< External definition
+ C_LABEL = 6, ///< Label
+ C_ULABEL = 7, ///< Undefined label
+ C_MOS = 8, ///< Member of structure
+ C_ARG = 9, ///< Function argument
+ C_STRTAG = 10, ///< Structure tag
+ C_MOU = 11, ///< Member of union
+ C_UNTAG = 12, ///< Union tag
+ C_TPDEF = 13, ///< Type definition
+ C_USTATIC = 14, ///< Undefined static
+ C_ENTAG = 15, ///< Enumeration tag
+ C_MOE = 16, ///< Member of enumeration
+ C_REGPARM = 17, ///< Register parameter
+ C_FIELD = 18, ///< Bit field
+
+ C_BLOCK = 100, ///< ".bb" or ".eb" - beginning or end of block
+ C_FCN = 101, ///< ".bf" or ".ef" - beginning or end of function
+ C_EOS = 102, ///< End of structure
+ C_FILE = 103, ///< File name
+ C_LINE = 104, ///< Line number, reformatted as symbol
+ C_ALIAS = 105, ///< Duplicate tag
+ C_HIDDEN = 106 ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, a pointer to int has derived type "pointer to T" and base type "int".
+enum SymbolType {
+ T_NULL = 0, ///< No type info
+ T_ARG = 1, ///< Void function argument (only used by compiler)
+ T_VOID = 1, ///< The same as above. Just named differently in some specs.
+ T_CHAR = 2, ///< Character
+ T_SHORT = 3, ///< Short integer
+ T_INT = 4, ///< Integer
+ T_LONG = 5, ///< Long integer
+ T_FLOAT = 6, ///< Floating point
+ T_DOUBLE = 7, ///< Double-precision floating point
+ T_STRUCT = 8, ///< Structure
+ T_UNION = 9, ///< Union
+ T_ENUM = 10, ///< Enumeration
+ T_MOE = 11, ///< Member of enumeration
+ T_UCHAR = 12, ///< Unsigned character
+ T_USHORT = 13, ///< Unsigned short
+ T_UINT = 14, ///< Unsigned integer
+ T_ULONG = 15 ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+ DT_NON = 0, ///< No derived type
+ DT_PTR = 1, ///< Pointer to T
+ DT_FCN = 2, ///< Function returning T
+ DT_ARY = 3 ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+ N_BTMASK = 017, ///< Mask for base type
+ N_TMASK = 060 ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+ N_BTSHFT = 4 ///< Type is formed as (base | (derived << N_BTSHFT))
+};
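+
+// Worked example (illustrative only): "function returning int" is encoded as
+// (DT_FCN << N_BTSHFT) | T_INT = (2 << 4) | 4 = 0x24, and "pointer to char"
+// as (DT_PTR << N_BTSHFT) | T_CHAR = (1 << 4) | 2 = 0x12. The base part can
+// be recovered with (type & N_BTMASK) and the derived part with
+// (type & N_TMASK).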
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..7f99203
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,360 @@
+//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH; however, this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in ST0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another,
+ // weirdly; this is really the sse-regparm calling convention) in which
+ // case they use XMM0, otherwise it is the same as the common X86 calling
+ // conv.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2, otherwise it is the same as the C calling convention.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 SSE regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0 except for v1i64 which is
+ // returned in RAX. This disagrees with ABI documentation but is bug
+ // compatible with gcc.
+ CCIfType<[v1i64], CCAssignToReg<[RAX]>>,
+ CCIfType<[v8i8, v4i16, v2i32, v2f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[RAX]>>,
+
+ // And FP in XMM0 only.
+ CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
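+
+// Example of the delegation above (illustrative): returning a plain f32 from
+// a C-convention function on a 32-bit target walks RetCC_X86 -> RetCC_X86_32
+// -> RetCC_X86_32_C and lands in ST0, while the same return on a non-Windows
+// x86-64 target walks RetCC_X86 -> RetCC_X86_64 -> RetCC_X86_64_C and lands
+// in XMM0.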
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+ // registers on Darwin.
+ CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>>,
+
+ // The first 5 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToStack<8, 8>>
+]>;
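+
+// Illustrative example: for a SysV x86-64 call such as f(int, double, char*),
+// the table above assigns the int to EDI, the double to XMM0, and the pointer
+// to RSI; integer and SSE registers are consumed from independent pools,
+// unlike the positional scheme used by Win64 below.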
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
+ CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 16-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 16>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 16-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 16>>
+]>;
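+
+// Illustrative example of the shadowing above: for f(int, double, int) on
+// Win64, the first argument lands in ECX, the second in XMM1 (RDX is consumed
+// as its shadow), and the third in R8D, because Win64 assigns registers by
+// argument position rather than per register class.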
+
+// Tail call convention (fast): One register is reserved for target address,
+// namely R9
+def CC_X86_64_TailCall : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 5 integer arguments are passed in integer registers; R9 is
+ // reserved for the tail call target address.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+ // registers on Darwin.
+ CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // The first 5 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
+ // registers if the call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // The first 4 SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
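+
+// Illustrative example: under X86_FastCall, f(int a, int b, int c) receives
+// a in ECX and b in EDX, while c falls through to CC_X86_32_Common and is
+// passed in a 4-byte stack slot.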
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..e988a5c
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,811 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+template<class CodeEmitter>
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+ const X86InstrInfo *II;
+ const TargetData *TD;
+ X86TargetMachine &TM;
+ CodeEmitter &MCE;
+ intptr_t PICBaseOffset;
+ bool Is64BitMode;
+ bool IsPIC;
+ public:
+ static char ID;
+ explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), II(0), TD(0), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+ const X86InstrInfo &ii, const TargetData &td, bool is64)
+ : MachineFunctionPass(&ID), II(&ii), TD(&td), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI,
+ const TargetInstrDesc *Desc);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+ void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp = 0, intptr_t PCAdj = 0,
+ bool NeedStub = false, bool Indirect = false);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+ intptr_t PCAdj = 0);
+ void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj = 0);
+
+ void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+ intptr_t PCAdj = 0);
+
+ void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+ void emitRegModRMByte(unsigned RegOpcodeField);
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+ void emitConstant(uint64_t Val, unsigned Size);
+
+ void emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo) const;
+
+ bool gvNeedsNonLazyPtr(const GlobalValue *GV);
+ };
+
+template<class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+}
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified templated MachineCodeEmitter object.
+
+namespace llvm {
+
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter<MachineCodeEmitter>(TM, MCE);
+}
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+} // end namespace llvm
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+ II = TM.getInstrInfo();
+ TD = TM.getTargetData();
+ Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+ IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
+ do {
+ DOUT << "JITTing function '" << MF.getFunction()->getName() << "'\n";
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const TargetInstrDesc &Desc = I->getDesc();
+ emitInstruction(*I, &Desc);
+ // MOVPC32r is basically a call plus a pop instruction.
+ if (Desc.getOpcode() == X86::MOVPC32r)
+ emitInstruction(*I, &II->get(X86::POP32r));
+ NumEmitted++; // Keep track of the # of mi's emitted
+ }
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+ // Remember where this reference was and where it is to so we can
+ // deal with it later.
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, MBB));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */,
+ bool NeedStub /* = false */,
+ bool Indirect /* = false */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ GV, RelocCST, NeedStub)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, RelocCST, NeedStub);
+ MCE.addRelocation(MR);
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, RelocCST));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
+ return II->getRegisterInfo().getX86RegNum(RegNo);
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
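+
+// Illustrative example: ModRMByte(3, 0, 1) yields 0xC1, i.e. Mod=11 (register
+// direct), RegOpcode=/0, RM=ECX -- the ModR/M byte of "inc %ecx" (FF C1) in
+// 32-bit mode.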
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+ unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+ unsigned Index,
+ unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
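+
+// For example, emitConstant(0x12345678, 4) emits the bytes 78 56 34 12, in
+// that order.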
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
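+
+// For example, isDisp8(127) and isDisp8(-128) are true, but isDisp8(128) is
+// not, so a displacement of 128 needs the disp32 form.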
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::gvNeedsNonLazyPtr(const GlobalValue *GV) {
+ // For Darwin, simulate the linktime GOT by using the same non-lazy-pointer
+ // mechanism as 32-bit mode.
+ return (!Is64BitMode || TM.getSubtarget<X86Subtarget>().isTargetDarwin()) &&
+ TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal, intptr_t PCAdj) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ if (RelocOp->isGlobal()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial.
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ bool NeedStub = isa<Function>(RelocOp->getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(RelocOp->getGlobal());
+ emitGlobalAddress(RelocOp->getGlobal(), rt, RelocOp->getOffset(),
+ PCAdj, NeedStub, Indirect);
+ } else if (RelocOp->isCPI()) {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
+ emitConstPoolAddress(RelocOp->getIndex(), rt,
+ RelocOp->getOffset(), PCAdj);
+ } else if (RelocOp->isJTI()) {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_picrel_word;
+ emitJumpTableAddress(RelocOp->getIndex(), rt, PCAdj);
+ } else {
+ assert(0 && "Unknown value to relocate!");
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJTI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
+ if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
+ IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit simple indirect register encoding... [EAX] f.e.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ } else if (!DispForReloc && isDisp8(DispVal)) {
+ // Emit the disp8 encoding... [REG+disp8]
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispForReloc) {
+ // Emit the normal disp32 encoding.
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ } else if (isDisp8(DispVal)) {
+ // Emit the disp8 encoding...
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding...
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base register. The
+ // displacement (forced to 32 bits above) is emitted after the SIB byte.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, 5);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, BaseRegNo);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8) {
+ emitConstant(DispVal, 1);
+ } else if (DispVal != 0 || ForceDisp32) {
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(
+ const MachineInstr &MI,
+ const TargetInstrDesc *Desc) {
+ DOUT << MI;
+
+ unsigned Opcode = Desc->Opcode;
+
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK) MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ default: assert(0 && "Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = X86InstrInfo::determineREX(MI);
+ if (REX)
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ ++CurOp;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ assert(0 && "psuedo instructions should be removed before code emission");
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI.getOperand(0).getSymbolName()[0]) {
+ assert(0 && "JIT does not support inline asm!\n");
+ abort();
+ }
+ break;
+ }
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getImm());
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case X86::DWARF_LOC:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ MCE.emitByte(BaseOpcode);
+ emitConstant(0, X86InstrInfo::sizeOfImm(Desc));
+ // Remember PIC base.
+ PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+ X86JITInfo *JTI = TM.getJITInfo();
+ JTI->setPICBase(MCE.getCurrentPCValue());
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm:
+ MCE.emitByte(BaseOpcode);
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+
+ DOUT << "RawFrm CurOp " << CurOp << "\n";
+ DOUT << "isMBB " << MO.isMBB() << "\n";
+ DOUT << "isGlobal " << MO.isGlobal() << "\n";
+ DOUT << "isSymbol " << MO.isSymbol() << "\n";
+ DOUT << "isImm " << MO.isImm() << "\n";
+
+ if (MO.isMBB()) {
+ emitPCRelativeBlockAddress(MO.getMBB());
+ } else if (MO.isGlobal()) {
+ // Assume undefined functions may be outside the Small codespace.
+ bool NeedStub =
+ (Is64BitMode &&
+ (TM.getCodeModel() == CodeModel::Large ||
+ TM.getSubtarget<X86Subtarget>().isTargetDarwin())) ||
+ Opcode == X86::TAILJMPd;
+ emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+ MO.getOffset(), 0, NeedStub);
+ } else if (MO.isSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ } else if (MO.isImm()) {
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+ // Fix up immediate operand for pc relative calls.
+ intptr_t Imm = (intptr_t)MO.getImm();
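+ // The 32-bit displacement is relative to the end of the 4-byte immediate
+ // emitted below, hence the extra -4 past the current PC.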
+ Imm = Imm - MCE.getCurrentPCValue() - 4;
+ emitConstant(Imm, X86InstrInfo::sizeOfImm(Desc));
+ } else
+ emitConstant(MO.getImm(), X86InstrInfo::sizeOfImm(Desc));
+ } else {
+ assert(0 && "Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ // This should not occur on Darwin for relocatable objects.
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+ case X86II::MRMDestMem: {
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp,
+ getX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)
+ .getReg()));
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
+ break;
+
+ case X86II::MRMSrcMem: {
+ // FIXME: Maybe lea should have its own form?
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+ X86InstrInfo::sizeOfImm(Desc) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+ PCAdj);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ MCE.emitByte(BaseOpcode);
+
+ // Special handling of lfence, mfence, monitor, and mwait.
+ if (Desc->getOpcode() == X86::LFENCE ||
+ Desc->getOpcode() == X86::MFENCE ||
+ Desc->getOpcode() == X86::MONITOR ||
+ Desc->getOpcode() == X86::MWAIT) {
+ emitRegModRMByte((Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+ switch (Desc->getOpcode()) {
+ default: break;
+ case X86::MONITOR:
+ MCE.emitByte(0xC8);
+ break;
+ case X86::MWAIT:
+ MCE.emitByte(0xC9);
+ break;
+ }
+ } else {
+ emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+ (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+ }
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool NeedStub = isa<Function>(MO1.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO1.getGlobal());
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ intptr_t PCAdj = (CurOp + X86AddrNumOperands != NumOps) ?
+ (MI.getOperand(CurOp+X86AddrNumOperands).isImm() ?
+ X86InstrInfo::sizeOfImm(Desc) : 4) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ PCAdj);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO.isImm())
+ emitConstant(MO.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO.isGlobal()) {
+ bool NeedStub = isa<Function>(MO.getGlobal());
+ bool Indirect = gvNeedsNonLazyPtr(MO.getGlobal());
+ emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+ NeedStub, Indirect);
+ } else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), rt);
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), rt);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ MCE.emitByte(BaseOpcode);
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ ++CurOp;
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+ cerr << "Cannot encode: ";
+ MI.dump();
+ cerr << '\n';
+ abort();
+ }
+}
+
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
new file mode 100644
index 0000000..8002f98
--- /dev/null
+++ b/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -0,0 +1,67 @@
+;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---===
+;;
+;; The LLVM Compiler Infrastructure
+;;
+;; This file is distributed under the University of Illinois Open Source
+;; License. See LICENSE.TXT for details.
+;;
+;;===----------------------------------------------------------------------===
+;;
+;; This file implements the JIT interfaces for the X86 target.
+;;
+;;===----------------------------------------------------------------------===
+
+extrn X86CompilationCallback2: PROC
+
+.code
+X86CompilationCallback proc
+ push rbp
+
+ ; Save RSP
+ mov rbp, rsp
+
+ ; Save all int arg registers
+ push rcx
+ push rdx
+ push r8
+ push r9
+
+ ; Align stack on 16-byte boundary.
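+ ; (the movaps saves below require a 16-byte aligned stack, and
+ ; "and rsp, -16" clears the low four bits of RSP)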
+ and rsp, -16
+
+ ; Save all XMM arg registers
+ sub rsp, 64
+ movaps [rsp], xmm0
+ movaps [rsp+16], xmm1
+ movaps [rsp+32], xmm2
+ movaps [rsp+48], xmm3
+
+ ; JIT callee
+
+ ; Pass prev frame and return address
+ mov rcx, rbp
+ mov rdx, qword ptr [rbp+8]
+ call X86CompilationCallback2
+
+ ; Restore all XMM arg registers
+ movaps xmm3, [rsp+48]
+ movaps xmm2, [rsp+32]
+ movaps xmm1, [rsp+16]
+ movaps xmm0, [rsp]
+
+ ; Restore RSP
+ mov rsp, rbp
+
+ ; Restore all int arg registers
+ sub rsp, 32
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+
+ ; Restore RBP
+ pop rbp
+ ret
+X86CompilationCallback endp
+
+End
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..4c3cc82
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,18 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+using namespace llvm;
+
+X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {}
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..06e051a
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,29 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+ class X86ELFWriterInfo : public TargetELFWriterInfo {
+ public:
+ X86ELFWriterInfo();
+ virtual ~X86ELFWriterInfo();
+ };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..b3667be
--- /dev/null
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,1549 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// StackPtr - Register used as the stack pointer.
+ ///
+ unsigned StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ )
+ : FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ ) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ }
+
+ virtual bool TargetSelectInstruction(Instruction *I);
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(Value *LHS, Value *RHS, MVT VT);
+
+ bool X86FastEmitLoad(MVT VT, const X86AddressMode &AM, unsigned &RR);
+
+ bool X86FastEmitStore(MVT VT, Value *Val,
+ const X86AddressMode &AM);
+ bool X86FastEmitStore(MVT VT, unsigned Val,
+ const X86AddressMode &AM);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT, unsigned Src, MVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall);
+
+ bool X86SelectLoad(Instruction *I);
+
+ bool X86SelectStore(Instruction *I);
+
+ bool X86SelectCmp(Instruction *I);
+
+ bool X86SelectZExt(Instruction *I);
+
+ bool X86SelectBranch(Instruction *I);
+
+ bool X86SelectShift(Instruction *I);
+
+ bool X86SelectSelect(Instruction *I);
+
+ bool X86SelectTrunc(Instruction *I);
+
+ bool X86SelectFPExt(Instruction *I);
+ bool X86SelectFPTrunc(Instruction *I);
+
+ bool X86SelectExtractValue(Instruction *I);
+
+ bool X86VisitIntrinsicCall(IntrinsicInst &I);
+ bool X86SelectCall(Instruction *I);
+
+ CCAssignFn *CCAssignFnForCall(unsigned CC, bool isTailCall = false);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return getTargetMachine()->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ unsigned TargetMaterializeConstant(Constant *C);
+
+ unsigned TargetMaterializeAlloca(AllocaInst *C);
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(MVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+};
+
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
+ VT = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// CCAssignFnForCall - Selects the correct CCAssignFn for a given calling
+/// convention.
+CCAssignFn *X86FastISel::CCAssignFnForCall(unsigned CC, bool isTailCall) {
+ if (Subtarget->is64Bit()) {
+ if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+ else if (CC == CallingConv::Fast && isTailCall)
+ return CC_X86_64_TailCall;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else
+ return CC_X86_32_C;
+}
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(MVT VT, const X86AddressMode &AM,
+ unsigned &ResultReg) {
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisting of a base ptr, Ptr,
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool
+X86FastISel::X86FastEmitStore(MVT VT, unsigned Val,
+ const X86AddressMode &AM) {
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32: Opc = X86::MOV32mr; break;
+ case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+ case MVT::f32:
+ Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+ Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+ break;
+ }
+
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM).addReg(Val);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(MVT VT, Value *Val,
+ const X86AddressMode &AM) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(TD.getIntPtrType());
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT()) {
+ default: break;
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if ((int)CI->getSExtValue() == CI->getSExtValue())
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM)
+ .addImm(CI->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ return X86FastEmitStore(VT, ValReg, AM);
+}
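+
+// For example (illustrative), a store of the constant i32 42 through an
+// already-resolved address folds into a single MOV32mi with immediate 42,
+// rather than materializing the constant in a register first.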
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, MVT DstVT,
+ unsigned Src, MVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
+
+ if (RR != 0) {
+ ResultReg = RR;
+ return true;
+ } else
+ return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM, bool isCall) {
+ User *U;
+ unsigned Opcode = Instruction::UserOp1;
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ break;
+
+ case Instruction::Alloca: {
+ if (isCall) break;
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A);
+ if (SI != StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ if (isCall) break;
+ // Adds of constants are common and easy enough.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt32(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ if (isCall) break;
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
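+    // For example (illustrative), a GEP such as
+    //   %p = getelementptr i32* %base, i32 %i
+    // folds into a single X86AddressMode with Scale = 4 and IndexReg holding
+    // %i, leaving %base to be resolved by the recursive call below.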
+ for (User::op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ Value *Op = *i;
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ Disp += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ } else if (IndexReg == 0 &&
+ (!AM.GV ||
+ !getTargetMachine()->symbolicAddressesAreRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op);
+ if (IndexReg == 0)
+ return false;
+ } else
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+ // Check for displacement overflow.
+ if (!isInt32(Disp))
+ break;
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM, isCall);
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ // Handle constant address.
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Default &&
+ TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (getTargetMachine()->symbolicAddressesAreRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS yet.
+ if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Set up the basic address.
+ AM.GV = GV;
+ if (!isCall &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit())
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+
+ // Emit an extra load if the ABI requires it.
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, isCall)) {
+ // Check to see if we've already materialized this
+ // value in a register in this block.
+ if (unsigned Reg = LocalValueMap[V]) {
+ AM.Base.Reg = Reg;
+ AM.GV = 0;
+ return true;
+ }
+ // Issue load from stub if necessary.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (TLI.getPointerTy() == MVT::i32) {
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ } else {
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ }
+
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = AM.GV;
+ unsigned ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), StubAM);
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = ResultReg;
+ AM.GV = 0;
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = AM.Base.Reg;
+ }
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !getTargetMachine()->symbolicAddressesAreRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(Instruction* I) {
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(1), AM, false))
+ return false;
+
+ return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(0), AM, false))
+ return false;
+
+ unsigned ResultReg = 0;
+ if (X86FastEmitLoad(VT, AM, ResultReg)) {
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ return false;
+}
+
+static unsigned X86ChooseCmpOpcode(MVT VT) {
+ switch (VT.getSimpleVT()) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32: return X86::UCOMISSrr;
+ case MVT::f64: return X86::UCOMISDrr;
+ }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHSC as the RHS
+/// of the comparison, return an opcode that can fold the immediate into the
+/// compare (e.g. CMP32ri); otherwise return 0.
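+/// For example, an i64 compare against 42 can use CMP64ri32, while a compare
+/// against 0x100000000 cannot, because the immediate must fit in a
+/// sign-extended 32-bit field; such compares fall back to CMP64rr.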
+static unsigned X86ChooseCmpImmediateOpcode(MVT VT, ConstantInt *RHSC) {
+ switch (VT.getSimpleVT()) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default: return 0;
+ case MVT::i8: return X86::CMP8ri;
+ case MVT::i16: return X86::CMP16ri;
+ case MVT::i32: return X86::CMP32ri;
+ case MVT::i64:
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, MVT VT) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(TD.getIntPtrType());
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(MBB, DL, TII.get(CompareImmOpc)).addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT);
+ if (CompareOpc == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(MBB, DL, TII.get(CompareOpc)).addReg(Op0Reg).addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(Instruction *I) {
+ CmpInst *CI = cast<CmpInst>(I);
+
+ MVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ unsigned ResultReg = createResultReg(&X86::GR8RegClass);
+ unsigned SetCCOpc;
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ switch (CI->getPredicate()) {
+ case CmpInst::FCMP_OEQ: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
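+    // Ordered-equal needs ZF set and PF clear (the UCOMIS* instructions raise
+    // PF on an unordered result), hence the SETE/SETNP/AND sequence below,
+    // roughly (register choices illustrative):
+    //   ucomiss %xmm1, %xmm0
+    //   sete    %al
+    //   setnp   %cl
+    //   andb    %al, %cl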
+ unsigned EReg = createResultReg(&X86::GR8RegClass);
+ unsigned NPReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETEr), EReg);
+ BuildMI(MBB, DL, TII.get(X86::SETNPr), NPReg);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_UNE: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
+ unsigned NEReg = createResultReg(&X86::GR8RegClass);
+ unsigned PReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETNEr), NEReg);
+ BuildMI(MBB, DL, TII.get(X86::SETPr), PReg);
+ BuildMI(MBB, DL, TII.get(X86::OR8rr), ResultReg).addReg(PReg).addReg(NEReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
+ default:
+ return false;
+ }
+
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of Op0/Op1.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(SetCCOpc), ResultReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(Instruction *I) {
+ // Handle zero-extension from i1 to i8, which is common.
+ if (I->getType() == Type::Int8Ty &&
+ I->getOperand(0)->getType() == Type::Int1Ty) {
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0) return false;
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg);
+ if (ResultReg == 0) return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ return false;
+}
+
+
+bool X86FastISel::X86SelectBranch(Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison.
+ if (CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse()) {
+ MVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+ // Try to take advantage of fallthrough opportunities.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
+
+ switch (Predicate) {
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::FCMP_UNE;
+ // FALL THROUGH
+ case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE; break;
+ default:
+ return false;
+ }
+
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);
+
+ if (Predicate == CmpInst::FCMP_UNE) {
+ // X86 requires a second branch to handle UNE (and OEQ,
+ // which is mapped to UNE above).
+ BuildMI(MBB, DL, TII.get(X86::JP)).addMBB(TrueMBB);
+ }
+
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ } else if (ExtractValueInst *EI =
+ dyn_cast<ExtractValueInst>(BI->getCondition())) {
+ // Check to see if the branch instruction is from an "arithmetic with
+ // overflow" intrinsic. The main way these intrinsics are used is:
+ //
+ // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ // %sum = extractvalue { i32, i1 } %t, 0
+ // %obit = extractvalue { i32, i1 } %t, 1
+ // br i1 %obit, label %overflow, label %normal
+ //
+  // The %sum and %obit are converted into an ADD and a SETO/SETB before
+ // reaching the branch. Therefore, we search backwards through the MBB
+ // looking for the SETO/SETB instruction. If an instruction modifies the
+ // EFLAGS register before we reach the SETO/SETB instruction, then we can't
+ // convert the branch into a JO/JB instruction.
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
+ if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
+ CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
+ const MachineInstr *SetMI = 0;
+ unsigned Reg = lookUpRegForValue(EI);
+
+ for (MachineBasicBlock::const_reverse_iterator
+ RI = MBB->rbegin(), RE = MBB->rend(); RI != RE; ++RI) {
+ const MachineInstr &MI = *RI;
+
+ if (MI.modifiesRegister(Reg)) {
+ unsigned Src, Dst, SrcSR, DstSR;
+
+ if (getInstrInfo()->isMoveInstr(MI, Src, Dst, SrcSR, DstSR)) {
+ Reg = Src;
+ continue;
+ }
+
+ SetMI = &MI;
+ break;
+ }
+
+ const TargetInstrDesc &TID = MI.getDesc();
+ if (TID.hasUnmodeledSideEffects() ||
+ TID.hasImplicitDefOfPhysReg(X86::EFLAGS))
+ break;
+ }
+
+ if (SetMI) {
+ unsigned OpCode = SetMI->getOpcode();
+
+ if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
+ BuildMI(MBB, DL, TII.get(OpCode == X86::SETOr ? X86::JO : X86::JB))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(OpReg).addReg(OpReg);
+ BuildMI(MBB, DL, TII.get(X86::JNE)).addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(Instruction *I) {
+ unsigned CReg = 0, OpReg = 0, OpImm = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (I->getType() == Type::Int8Ty) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int16Ty) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int32Ty) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
+ default: return false;
+ }
+ } else if (I->getType() == Type::Int64Ty) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+ // Fold immediate in shl(x,3).
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpImm),
+ ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ TII.copyRegToReg(*MBB, MBB->end(), CReg, Op1Reg, RC, RC);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit an EXTRACT_SUBREG to precisely describe what
+ // we're doing here.
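+  // For a variable i32 shift, for example, the emitted sequence is roughly:
+  // a copy of the shift amount into ECX, an EXTRACT_SUBREG defining CL from
+  // ECX, and then SHL32rCL/SHR32rCL/SAR32rCL, which reads CL implicitly.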
+ if (CReg != X86::CL)
+ BuildMI(MBB, DL, TII.get(TargetInstrInfo::EXTRACT_SUBREG), X86::CL)
+ .addReg(CReg).addImm(X86::SUBREG_8BIT);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpReg), ResultReg).addReg(Op0Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(Instruction *I) {
+ MVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (VT.getSimpleVT() == MVT::i16) {
+ Opc = X86::CMOVE16rr;
+ RC = &X86::GR16RegClass;
+ } else if (VT.getSimpleVT() == MVT::i32) {
+ Opc = X86::CMOVE32rr;
+ RC = &X86::GR32RegClass;
+ } else if (VT.getSimpleVT() == MVT::i64) {
+ Opc = X86::CMOVE64rr;
+ RC = &X86::GR64RegClass;
+ } else {
+ return false;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ unsigned Op2Reg = getRegForValue(I->getOperand(2));
+ if (Op2Reg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(Op0Reg).addReg(Op0Reg);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg).addReg(Op1Reg).addReg(Op2Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(Instruction *I) {
+ // fpext from float to double.
+ if (Subtarget->hasSSE2() && I->getType() == Type::DoubleTy) {
+ Value *V = I->getOperand(0);
+ if (V->getType() == Type::FloatTy) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSS2SDrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(Instruction *I) {
+ if (Subtarget->hasSSE2()) {
+ if (I->getType() == Type::FloatTy) {
+ Value *V = I->getOperand(0);
+ if (V->getType() == Type::DoubleTy) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSD2SSrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(Instruction *I) {
+ if (Subtarget->is64Bit())
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+
+ // This code only handles truncation to byte right now.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+ if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // First issue a copy to GR16_ABCD or GR32_ABCD.
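+  // (In 32-bit mode only EAX/EBX/ECX/EDX have addressable 8-bit subregisters,
+  // so the input must first be constrained to the _ABCD register classes.)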
+ unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+ ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+ unsigned CopyReg = createResultReg(CopyRC);
+ BuildMI(MBB, DL, TII.get(CopyOpc), CopyReg).addReg(InputReg);
+
+ // Then issue an extract_subreg.
+ unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+ CopyReg, X86::SUBREG_8BIT);
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectExtractValue(Instruction *I) {
+ ExtractValueInst *EI = cast<ExtractValueInst>(I);
+ Value *Agg = EI->getAggregateOperand();
+
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
+ switch (CI->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ // Cheat a little. We know that the registers for "add" and "seto" are
+ // allocated sequentially. However, we only keep track of the register
+ // for "add" in the value map. Use extractvalue's index to get the
+ // correct register for "seto".
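+      // For example (virtual register numbers illustrative): if the add's
+      // result lives in %reg1024, its overflow bit is assumed to live in
+      // %reg1025, and the extractvalue index (0 or 1) selects between them.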
+ UpdateValueMap(I, lookUpRegForValue(Agg) + *EI->idx_begin());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow: {
+ // Replace "add with overflow" intrinsics with an "add" instruction followed
+ // by a seto/setc instruction. Later on, when the "extractvalue"
+ // instructions are encountered, we use the fact that two registers were
+ // created sequentially to get the correct registers for the "sum" and the
+ // "overflow bit".
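+    // The emitted code is roughly (register choices illustrative):
+    //   addl %esi, %edi
+    //   seto %al            // SETBr instead for the unsigned variant
+    // with the sum and the flag byte kept in consecutive virtual registers.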
+ const Function *Callee = I.getCalledFunction();
+ const Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ Value *Op1 = I.getOperand(1);
+ Value *Op2 = I.getOperand(2);
+ unsigned Reg1 = getRegForValue(Op1);
+ unsigned Reg2 = getRegForValue(Op2);
+
+ if (Reg1 == 0 || Reg2 == 0)
+ // FIXME: Handle values *not* in registers.
+ return false;
+
+ unsigned OpC = 0;
+ if (VT == MVT::i32)
+ OpC = X86::ADD32rr;
+ else if (VT == MVT::i64)
+ OpC = X86::ADD64rr;
+ else
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(MBB, DL, TII.get(OpC), ResultReg).addReg(Reg1).addReg(Reg2);
+ unsigned DestReg1 = UpdateValueMap(&I, ResultReg);
+
+ // If the add with overflow is an intra-block value then we just want to
+ // create temporaries for it like normal. If it is a cross-block value then
+ // UpdateValueMap will return the cross-block register used. Since we
+ // *really* want the value to be live in the register pair known by
+ // UpdateValueMap, we have to use DestReg1+1 as the destination register in
+ // the cross block case. In the non-cross-block case, we should just make
+ // another register for the value.
+ if (DestReg1 != ResultReg)
+ ResultReg = DestReg1+1;
+ else
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
+
+ unsigned Opc = X86::SETBr;
+ if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+ Opc = X86::SETOr;
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::X86SelectCall(Instruction *I) {
+ CallInst *CI = cast<CallInst>(I);
+ Value *Callee = I->getOperand(0);
+
+ // Can't handle inline asm yet.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Handle intrinsic calls.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
+ return X86VisitIntrinsicCall(*II);
+
+ // Handle only C and fastcc calling conventions for now.
+ CallSite CS(CI);
+ unsigned CC = CS.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall)
+ return false;
+
+ // On X86, -tailcallopt changes the fastcc ABI. FastISel doesn't
+ // handle this for now.
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ if (FTy->isVarArg())
+ return false;
+
+ // Handle *simple* calls for now.
+ const Type *RetTy = CS.getType();
+ MVT RetVT;
+ if (RetTy == Type::VoidTy)
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT, true))
+ return false;
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectAddress(Callee, CalleeAM, true))
+ return false;
+ unsigned CalleeOp = 0;
+ GlobalValue *GV = 0;
+ if (CalleeAM.Base.Reg != 0) {
+ assert(CalleeAM.GV == 0);
+ CalleeOp = CalleeAM.Base.Reg;
+ } else if (CalleeAM.GV != 0) {
+ assert(CalleeAM.GV != 0);
+ GV = CalleeAM.GV;
+ } else
+ return false;
+
+ // Allow calls which produce i1 results.
+ bool AndToI1 = false;
+ if (RetVT == MVT::i1) {
+ RetVT = MVT::i8;
+ AndToI1 = true;
+ }
+
+ // Deal with call operands first.
+ SmallVector<Value*, 8> ArgVals;
+ SmallVector<unsigned, 8> Args;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(CS.arg_size());
+ ArgVals.reserve(CS.arg_size());
+ ArgVTs.reserve(CS.arg_size());
+ ArgFlags.reserve(CS.arg_size());
+ for (CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ const Type *ArgTy = (*i)->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT))
+ return false;
+ unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(Arg);
+ ArgVals.push_back(*i);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, TM, ArgLocs);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackDown)).addImm(NumBytes);
+
+  // Process the arguments: walk the register/memloc assignments, inserting
+ // copies / loads.
+ SmallVector<unsigned, 4> RegArgs;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = Args[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+
+      assert(Emitted && "Failed to emit an aext!"); Emitted=Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ TargetRegisterClass* RC = TLI.getRegClassFor(ArgVT);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), VA.getLocReg(),
+ Arg, RC, RC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ RegArgs.push_back(VA.getLocReg());
+ } else {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = StackPtr;
+ AM.Disp = LocMemOffset;
+ Value *ArgVal = ArgVals[VA.getValNo()];
+
+ // If this is a really simple value, emit this with the Value* version of
+ // X86FastEmitStore. If it isn't simple, we don't want to do this, as it
+ // can cause us to reevaluate the argument.
+ if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
+ X86FastEmitStore(ArgVT, ArgVal, AM);
+ else
+ X86FastEmitStore(ArgVT, Arg, AM);
+ }
+ }
+
+  // ELF PIC requires the GOT pointer to be in EBX before making a function
+  // call through the PLT.
+ if (!Subtarget->is64Bit() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ TargetRegisterClass *RC = X86::GR32RegisterClass;
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ }
+
+ // Issue the call.
+ unsigned CallOpc = CalleeOp
+ ? (Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r)
+ : (Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32);
+ MachineInstrBuilder MIB = CalleeOp
+ ? BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp)
+ : BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV);
+
+  // Add an implicit use of the GOT pointer in EBX.
+ if (!Subtarget->is64Bit() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i]);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0);
+
+ // Now handle call return value (if any).
+ if (RetVT.getSimpleVT() != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, TM, RVLocs);
+ CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
+ MVT CopyVT = RVLocs[0].getValVT();
+ TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+ TargetRegisterClass *SrcRC = DstRC;
+
+ // If this is a call to a function that returns an fp value on the x87 fp
+ // stack, but where we prefer to use the value in xmm registers, copy it
+ // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((RVLocs[0].getLocReg() == X86::ST0 ||
+ RVLocs[0].getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
+ CopyVT = MVT::f80;
+ SrcRC = X86::RSTRegisterClass;
+ DstRC = X86::RFP80RegisterClass;
+ }
+
+ unsigned ResultReg = createResultReg(DstRC);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ RVLocs[0].getLocReg(), DstRC, SrcRC);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ if (CopyVT != RVLocs[0].getValVT()) {
+      // Round the F80 to the right size, which also moves it to the
+      // appropriate xmm register. This is accomplished by storing the F80
+      // value in memory and then loading it back. Ewww...
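+      // That is, roughly an FP-stack store of ST(0) into a stack temporary
+      // followed by a MOVSS/MOVSD reload of that slot into an XMM register.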
+ MVT ResVT = RVLocs[0].getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc)), FI).addReg(ResultReg);
+ DstRC = ResVT == MVT::f32
+ ? X86::FR32RegisterClass : X86::FR64RegisterClass;
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ ResultReg = createResultReg(DstRC);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), FI);
+ }
+
+ if (AndToI1) {
+      // Mask out all but the lowest bit, since the call produces an i1.
+ unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
+ ResultReg = AndResult;
+ }
+
+ UpdateValueMap(I, ResultReg);
+ }
+
+ return true;
+}
+
+
+bool
+X86FastISel::TargetSelectInstruction(Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::Call:
+ return X86SelectCall(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::ExtractValue:
+ return X86SelectExtractValue(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ MVT DstVT = TLI.getValueType(I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
+ MVT VT;
+ if (!isTypeLegal(C->getType(), VT))
+ return false;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT()) {
+ default: return false;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ // Materialize addresses with LEA instructions.
+ if (isa<GlobalValue>(C)) {
+ X86AddressMode AM;
+ if (X86SelectAddress(C, AM, false)) {
+ if (TLI.getPointerTy() == MVT::i32)
+ Opc = X86::LEA32r;
+ else
+ Opc = X86::LEA64r;
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+ }
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = TD.getTypeAllocSize(C->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit())
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+
+ // Create the load from the constant pool.
+ unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+ unsigned ResultReg = createResultReg(RC);
+ addConstantPoolReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), MCPOffset,
+ PICBase);
+
+ return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+ // various places, but TargetMaterializeAlloca also needs a check
+ // in order to avoid recursion between getRegForValue,
+// X86SelectAddress, and TargetMaterializeAlloca.
+ if (!StaticAllocaMap.count(C))
+ return 0;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM, false))
+ return 0;
+ unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+namespace llvm {
+ llvm::FastISel *X86::createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ ) {
+ return new X86FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+ }
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..0f2fbcc
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1187 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code. In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges. Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branch to a critical basic block.
+//
+// FIXME: this is not implemented yet. The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ MachineBasicBlock *MBB; // Current basic block
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned RegMap[8]; // Track which stack slot contains each register
+ unsigned StackTop; // The current top of the FP stack.
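+    // For example, after pushReg(3) and then pushReg(1): Stack = {3, 1},
+    // StackTop = 2, RegMap[3] = 0, RegMap[1] = 1, so FP1 is in ST(0) and
+    // FP3 is in ST(1).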
+
+ void dumpStack() const {
+ cerr << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ cerr << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ cerr << "\n";
+ }
+ private:
+ /// isStackEmpty - Return true if the FP stack is empty.
+ bool isStackEmpty() const {
+ return StackTop == 0;
+ }
+
+ // getSlot - Return the stack slot number a particular register number is
+ // in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < 8 && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ // getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ assert(STi < StackTop && "Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ // getSTReg - Return the X86::ST(i) register which contains the specified
+ // FP<RegNo> register.
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < 8 && "Register number out of range!");
+ assert(StackTop < 8 && "Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processor's version of the state.
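+      // For example, if FP<RegNo> currently sits in ST(2), this emits
+      // "fxch %st(2)", exchanging it with ST(0).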
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ NumFXCH++;
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ DebugLoc dl = I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ // popStackAfter - Pop the current value off of the top of the FP stack
+ // after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ // freeStackSlotAfter - Free the specified register from the register stack,
+ // so that it is no longer in a register. If the register is currently at
+    // the top of the stack, we just pop it after the current instruction;
+    // otherwise we
+ // store the current top-of-stack into the specified slot, then pop the top
+ // of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+ for (unsigned i = 0; i <= 6; ++i)
+ if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ TII = MF.getTarget().getInstrInfo();
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ MachineBasicBlock *Entry = MF.begin();
+
+ bool Changed = false;
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+ I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+ I != E; ++I)
+ Changed |= processBasicBlock(MF, **I);
+
+ return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ unsigned Flags = MI->getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
+ if (MI->getOpcode() == TargetInstrInfo::INLINEASM)
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = 0;
+ if (I != BB.begin())
+ PrevMI = prior(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DOUT << "\nFPInst:\t" << *MI;
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: assert(0 && "Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+    // If -debug is given, print out all of the instructions this expanded to.
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ cerr << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ cerr << "Inserted instructions:\n\t";
+ Start->print(*cerr.stream(), &MF.getTarget());
+ while (++Start != next(I)) {}
+ }
+ dumpStack();
+ );
+
+ Changed = true;
+ }
+
+ assert(isStackEmpty() && "Stack not empty at end of basic block?");
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ unsigned from;
+ unsigned to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 0; i != NumEntries-1; ++i)
+ if (!(Table[i] < Table[i+1])) return false;
+ return true;
+}
+#endif
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
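+// For example, the register-file pseudo X86::MUL_Fp32m is rewritten to
+// X86::MUL_F32m, which multiplies ST(0) by a 32-bit memory operand.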
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
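+
+// For example, the pseudo X86::MUL_Fp32m (multiply by a 32-bit memory operand
+// on the virtual FP register file) maps to the concrete x87 instruction
+// X86::MUL_F32m via the sorted OpcodeTable above.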
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
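+
+// For example, a plain 64-bit store ST_F64m (fstl) of a value that dies can be
+// replaced by its popping form ST_FP64m (fstpl) from this table, so no
+// separate pop instruction is needed.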
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr* MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ ASSERT_SORTED(PopTable);
+ assert(StackTop > 0 && "Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
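+
+// For example, an ADD_FrST0 whose result should also be popped is rewritten in
+// place to ADD_FPrST0 (faddp) via PopTable; when no popping form exists, an
+// explicit "fstp %st(0)" (ST_FPrr of ST0) is emitted right after the
+// instruction instead.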
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop it after the current instruction; otherwise we
+/// store the current top-of-stack into the specified slot, then pop the top of
+/// the stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
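+
+// For example, if the dead value lives in ST(3) while something else is on top
+// of the stack, this emits "fstp %st(3)": the top-of-stack value overwrites the
+// dead slot and the stack shrinks by one, with Stack/RegMap updated to match.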
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
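+
+// For example, the pseudo "FP0 = LD_Fp032" (load +0.0) becomes the concrete
+// LD_F0 (fldz), and FP0 is recorded as the new top of the stack.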
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert((NumOps == X86AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always ok.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64 ||
+ MI->getOpcode() == X86::IST_Fp64m80 ||
+ MI->getOpcode() == X86::ISTT_Fp16m80 ||
+ MI->getOpcode() == X86::ISTT_Fp32m80 ||
+ MI->getOpcode() == X86::ISTT_Fp64m80 ||
+ MI->getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, 7 /*temp register*/, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m ||
+ MI->getOpcode() == X86::ST_FP80m) {
+ assert(StackTop > 0 && "Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
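+
+// For example, "IST_Fp64m32 <mem>, FP2" only has the popping fistpll form
+// (IST_FP64m), so if FP2 is still live afterwards it is first duplicated to
+// the top of the stack (using slot 7 as scratch) and the copy is the one that
+// gets popped by the store.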
+
+
+/// handleOneArgFPRW - Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+#ifndef NDEBUG
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ assert(StackTop > 0 && "Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
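+
+// Concretely, for the "R1 = fchs R2" example above: if R2 dies here, it is
+// moved to the top of the stack and its slot simply renamed to R1 before the
+// concrete fchs runs; otherwise R2 is first copied to the top so its original
+// value stays live in its old slot.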
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
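+
+// Which of the four tables applies depends on which operand is on top of the
+// stack and which operand dies.  For example, for "FP2 = ADD_Fp32 FP0, FP1"
+// with FP0 on top and dying while FP1 stays live, the result must overwrite
+// ST(0), so the ForwardST0Table entry ADD_FST0r is used; if FP1 is the one
+// that dies, the result can overwrite FP1's slot instead and the
+// ForwardSTiTable entry ADD_FrST0 is chosen.  handleTwoArgFP below implements
+// the exact selection.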
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ const TableEntry *InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+ MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of stack, the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
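+
+// For example, "UCOM_FpIr32 FP0, FP3" first forces FP0 to the top of the
+// stack, is then rewritten to the concrete UCOM_FIr (fucomi) against the ST
+// register currently holding FP3, and any operand that dies here is popped
+// afterwards via freeStackSlotAfter.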
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
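+
+// For example, "FP0 = CMOVP_Fp32 FP0, FP2" first moves FP0 to the top of the
+// stack and then becomes the concrete CMOVP_F form reading from the ST
+// register that holds FP2; if FP2 dies here it is popped afterwards.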
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unknown SpecialFP instruction!");
+ case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
+ assert(StackTop == 0 && "Stack should be empty after a call!");
+ pushReg(getFPReg(MI->getOperand(0)));
+ break;
+ case X86::FpGET_ST1_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_80:{// Appears immediately after a call returning FP type!
+ // FpGET_ST1 should occur right after a FpGET_ST0 for a call or inline asm.
+ // The pattern we expect is:
+ // CALL
+ // FP1 = FpGET_ST0
+ // FP4 = FpGET_ST1
+ //
+ // At this point, we've pushed FP1 on the top of stack, so it should be
+ // present if it isn't dead. If it was dead, we already emitted a pop to
+ // remove it from the stack and StackTop = 0.
+
+ // Push FP4 as top of stack next.
+ pushReg(getFPReg(MI->getOperand(0)));
+
+ // If StackTop was 0 before we pushed our operand, then ST(0) must have been
+ // dead. In this case, the ST(1) value is the only thing that is live, so
+ // it should be on the TOS (after the pop that was emitted) and is. Just
+ // continue in this case.
+ if (StackTop == 1)
+ break;
+
+ // Because pushReg just pushed ST(1) as TOS, we now have to swap the two top
+ // elements so that our accounting is correct.
+ unsigned RegOnTop = getStackEntry(0);
+ unsigned RegNo = getStackEntry(1);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+ break;
+ }
+ case X86::FpSET_ST0_32:
+ case X86::FpSET_ST0_64:
+ case X86::FpSET_ST0_80:
+    assert((StackTop == 1 || StackTop == 2)
+           && "Stack should have one or two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::FpSET_ST1_32:
+ case X86::FpSET_ST1_64:
+ case X86::FpSET_ST1_80:
+ // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+    assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::MOV_Fp3232:
+ case X86::MOV_Fp3264:
+ case X86::MOV_Fp6432:
+ case X86::MOV_Fp6464:
+ case X86::MOV_Fp3280:
+ case X86::MOV_Fp6480:
+ case X86::MOV_Fp8032:
+ case X86::MOV_Fp8064:
+ case X86::MOV_Fp8080: {
+ const MachineOperand &MO1 = MI->getOperand(1);
+ unsigned SrcReg = getFPReg(MO1);
+
+ const MachineOperand &MO0 = MI->getOperand(0);
+    // These can be created due to inline asm. The two-address pass can
+    // introduce copies from RFP registers to virtual registers.
+ if (MO0.getReg() == X86::ST0 && SrcReg == 0) {
+ assert(MO1.isKill());
+ // Treat %ST0<def> = MOV_Fp8080 %FP0<kill>
+ // like FpSET_ST0_80 %FP0<kill>, %ST0<imp-def>
+      assert((StackTop == 1 || StackTop == 2)
+             && "Stack should have one or two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ } else if (MO0.getReg() == X86::ST1 && SrcReg == 1) {
+ assert(MO1.isKill());
+ // Treat %ST1<def> = MOV_Fp8080 %FP1<kill>
+ // like FpSET_ST1_80 %FP0<kill>, %ST1<imp-def>
+ // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+      assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ }
+
+ unsigned DestReg = getFPReg(MO0);
+ if (MI->killsRegister(X86::FP0+SrcReg)) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcReg);
+ assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+ Stack[Slot] = DestReg;
+ RegMap[DestReg] = Slot;
+
+ } else {
+ // For FMOV we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcReg, DestReg, I);
+ }
+ }
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr. Also, any kills should be explicitly popped after
+ // the inline asm.
+ unsigned Kills[7];
+ unsigned NumKills = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ assert(Op.isUse() && "Only handle inline asm uses right now");
+
+ unsigned FPReg = getFPReg(Op);
+ Op.setReg(getSTReg(FPReg));
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isKill())
+ Kills[NumKills++] = FPReg;
+ }
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ MachineBasicBlock::iterator InsertPt = MI;
+ while (NumKills)
+ freeStackSlotAfter(InsertPt, Kills[--NumKills]);
+
+ // Don't delete the inline asm!
+ return;
+ }
+
+ case X86::RET:
+ case X86::RETI:
+ // If RET has an FP register use operand, pass the first one in ST(0) and
+ // the second one in ST(1).
+ if (isStackEmpty()) return; // Quick check to see if any are possible.
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI->killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+
+ // Remove the operand so that later passes don't see it.
+ MI->RemoveOperand(i);
+ --i, --e;
+ }
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = (FirstFPRegOp+1)%7;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+    // Okay, we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+    // 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+    // in ST(1), emit an fxch to swap them.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+    // 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+    // ST(1).  Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+ return;
+ }
+
+ I = MBB->erase(I); // Remove the pseudo instruction
+ --I;
+}
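+
+// For example, a call that returns values in both ST(0) and ST(1) is followed
+// by FpGET_ST0_* and then FpGET_ST1_* pseudos; the code above pushes both
+// results onto its stack model and, when both are live, swaps its bookkeeping
+// so the model matches the order the hardware actually left them in.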
diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp
new file mode 100644
index 0000000..009846e
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPointRegKill.cpp
@@ -0,0 +1,139 @@
+//===-- X86FloatingPointRegKill.cpp - FP_REG_KILL inserter -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts FP_REG_KILL instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumFPKill, "Number of FP_REG_KILL instructions added");
+
+namespace {
+ struct VISIBILITY_HIDDEN FPRegKiller : public MachineFunctionPass {
+ static char ID;
+ FPRegKiller() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP_REG_KILL inserter"; }
+ };
+ char FPRegKiller::ID = 0;
+}
+
+FunctionPass *llvm::createX87FPRegKillInserterPass() { return new FPRegKiller(); }
+
+bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) {
+  // If we are emitting FP stack code, scan each basic block to determine if it
+  // defines any FP values.  If so, put an FP_REG_KILL instruction before the
+  // terminator of that block.
+
+ // Note that FP stack instructions are used in all modes for long double,
+ // so we always need to do this check.
+ // Also note that it's possible for an FP stack register to be live across
+ // an instruction that produces multiple basic blocks (SSE CMOV) so we
+ // must check all the generated basic blocks.
+
+ // Scan all of the machine instructions in these MBBs, checking for FP
+ // stores. (RFP32 and RFP64 will not exist in SSE mode, but RFP80 might.)
+
+ // Fast-path: If nothing is using the x87 registers, we don't need to do
+ // any scanning.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MRI.getRegClassVirtRegs(X86::RFP80RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP64RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP32RegisterClass).empty())
+ return false;
+
+ bool Changed = false;
+ const X86Subtarget &Subtarget = MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineFunction::iterator MBBI = MF.begin();
+ MachineFunction::iterator EndMBB = MF.end();
+ for (; MBBI != EndMBB; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+
+ // If this block returns, ignore it. We don't want to insert an FP_REG_KILL
+ // before the return.
+ if (!MBB->empty()) {
+ MachineBasicBlock::iterator EndI = MBB->end();
+ --EndI;
+ if (EndI->getDesc().isReturn())
+ continue;
+ }
+
+ bool ContainsFPCode = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ !ContainsFPCode && I != E; ++I) {
+ if (I->getNumOperands() != 0 && I->getOperand(0).isReg()) {
+ const TargetRegisterClass *clas;
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+ if (I->getOperand(op).isReg() && I->getOperand(op).isDef() &&
+ TargetRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+ ((clas = MRI.getRegClass(I->getOperand(op).getReg())) ==
+ X86::RFP32RegisterClass ||
+ clas == X86::RFP64RegisterClass ||
+ clas == X86::RFP80RegisterClass)) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+ // Check PHI nodes in successor blocks. These PHI's will be lowered to have
+ // a copy of the input value in this block. In SSE mode, we only care about
+ // 80-bit values.
+ if (!ContainsFPCode) {
+ // Final check, check LLVM BB's that are successors to the LLVM BB
+ // corresponding to BB for FP PHI nodes.
+ const BasicBlock *LLVMBB = MBB->getBasicBlock();
+ const PHINode *PN;
+ for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+ !ContainsFPCode && SI != E; ++SI) {
+ for (BasicBlock::const_iterator II = SI->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ if (PN->getType()==Type::X86_FP80Ty ||
+ (!Subtarget.hasSSE1() && PN->getType()->isFloatingPoint()) ||
+ (!Subtarget.hasSSE2() && PN->getType()==Type::DoubleTy)) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+ // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+ if (ContainsFPCode) {
+ BuildMI(*MBB, MBBI->getFirstTerminator(), DebugLoc::getUnknownLoc(),
+ MF.getTarget().getInstrInfo()->get(X86::FP_REG_KILL));
+ ++NumFPKill;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..bd1fea7
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,1716 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#include "llvm/Support/CommandLine.h"
+static cl::opt<bool> AvoidDupAddrCompute("x86-avoid-dup-address", cl::Hidden);
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+ /// SDValue's instead of register numbers for the leaves of the matched
+ /// tree.
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ struct { // This is really a union, discriminated by BaseType!
+ SDValue Reg;
+ int FrameIndex;
+ } Base;
+
+ bool isRIPRel; // RIP as base?
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ GlobalValue *GV;
+ Constant *CP;
+ const char *ES;
+ int JT;
+ unsigned Align; // CP alignment.
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(0), CP(0), ES(0), JT(-1), Align(0) {
+ }
+
+ bool hasSymbolicDisplacement() const {
+ return GV != 0 || CP != 0 || ES != 0 || JT != -1;
+ }
+
+ void dump() {
+ cerr << "X86ISelAddressMode " << this << "\n";
+ cerr << "Base.Reg ";
+ if (Base.Reg.getNode() != 0) Base.Reg.getNode()->dump();
+ else cerr << "nul";
+ cerr << " Base.FrameIndex " << Base.FrameIndex << "\n";
+ cerr << "isRIPRel " << isRIPRel << " Scale" << Scale << "\n";
+ cerr << "IndexReg ";
+ if (IndexReg.getNode() != 0) IndexReg.getNode()->dump();
+ else cerr << "nul";
+ cerr << " Disp " << Disp << "\n";
+ cerr << "GV "; if (GV) GV->dump();
+ else cerr << "nul";
+ cerr << " CP "; if (CP) CP->dump();
+ else cerr << "nul";
+ cerr << "\n";
+ cerr << "ES "; if (ES) cerr << ES; else cerr << "nul";
+ cerr << " JT" << JT << " Align" << Align << "\n";
+ }
+ };
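+
+  // For example, the address "20(%ebx,%esi,4)" would be described by
+  // BaseType == RegBase, Base.Reg == EBX, IndexReg == ESI, Scale == 4 and
+  // Disp == 20, with GV/CP/ES/JT unused; a reference like "foo+8" would
+  // instead set GV to foo and fold the 8 into Disp.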
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86 specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
+ /// TM - Keep a reference to X86TargetMachine.
+ ///
+ X86TargetMachine &TM;
+
+ /// X86Lowering - This object fully describes how to lower LLVM code to an
+ /// X86-specific SelectionDAG.
+ X86TargetLowering &X86Lowering;
+
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// CurBB - Current BB being isel'd.
+ ///
+ MachineBasicBlock *CurBB;
+
+ /// OptForSize - If true, selector should try to optimize for code size
+ /// instead of performance.
+ bool OptForSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel),
+ TM(tm), X86Lowering(*TM.getTargetLowering()),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()),
+ OptForSize(false) {}
+
+ virtual const char *getPassName() const {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ /// InstructionSelect - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelect();
+
+ virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+ virtual
+ bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U, SDNode *Root) const;
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDValue N);
+ SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
+ bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
+ bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddress(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth = 0);
+ bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool SelectAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectLEAAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp);
+ bool SelectScalarSSELoad(SDValue Op, SDValue Pred,
+ SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &InChain, SDValue &OutChain);
+ bool TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ void PreprocessForRMW();
+ void PreprocessForFPConvert();
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+ CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+ AM.Base.Reg;
+ Scale = getI8Imm(AM.Scale);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp);
+ else if (AM.ES)
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
+ else if (AM.JT != -1)
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ }
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDValue getI8Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDValue getI16Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i16);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getGlobalBaseReg - Return an SDNode that returns the value of
+ /// the global base register. Output instructions required to
+ /// initialize the global base register, if necessary.
+ ///
+ SDNode *getGlobalBaseReg();
+
+#ifndef NDEBUG
+ unsigned Indent;
+#endif
+ };
+}
+
+
+bool X86DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+ SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (U == Root)
+ switch (U->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+ // If the other operand is a 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In the case where the increment is 1,
+      // the saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // if the block also has an access to a second TLS address this will save
+ // a load.
+      // FIXME: This is probably also true for non-TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+ }
+ }
+
+ // Proceed to 'generic' cycle finder code
+ return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
+}
+
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Store, SDValue TF) {
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
+ if (Load.getNode() == TF.getOperand(i).getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(TF.getOperand(i));
+ CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
+ CurDAG->UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+ Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isRMWLoad - Return true if N is a load that's part of an RMW sub-DAG.
+///
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+ SDValue &Load) {
+ if (N.getOpcode() == ISD::BIT_CONVERT)
+ N = N.getOperand(0);
+
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+ if (!LD || LD->isVolatile())
+ return false;
+ if (LD->getAddressingMode() != ISD::UNINDEXED)
+ return false;
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
+ return false;
+
+ if (N.hasOneUse() &&
+ N.getOperand(1) == Address &&
+ N.getNode()->isOperandOf(Chain.getNode())) {
+ Load = N;
+ return true;
+ }
+ return false;
+}
+
+/// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain
+/// operand and move load below the call's chain operand.
+static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue CallSeqStart) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = CallSeqStart.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected CallSeqStart chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ MVT::Other, &Ops[0], Ops.size());
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ for (unsigned i = 1, e = CallSeqStart.getNumOperands(); i != e; ++i)
+ Ops.push_back(CallSeqStart.getOperand(i));
+ CurDAG->UpdateNodeOperands(CallSeqStart, &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load, Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+ Ops.push_back(Call.getOperand(i));
+ CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size());
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ LD->isVolatile() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()))
+ return true;
+ return false;
+}
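+
+// For example, an indirect call through a function pointer loaded from memory
+// ("movl fp, %eax; call *%eax") can become a single "call *fp" once the load
+// is moved below CALLSEQ_START by MoveBelowCallSeqStart and folded into the
+// CALL node during selection (illustrative AT&T sequence).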
+
+
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -O0 mode.
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// / \-
+/// / |
+/// [TokenFactor] [Op]
+/// ^ ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
+///
+/// The fact that the store's chain operand != the load's chain result will
+/// prevent the (store (op (load))) instruction from being selected. We can
+/// transform it to:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [TokenFactor]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// | \-
+/// | |
+/// | [Op]
+/// | ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
+void X86DAGToDAGISel::PreprocessForRMW() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
+ if (I->getOpcode() == X86ISD::CALL) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ SDValue Chain = I->getOperand(0);
+ SDValue Load = I->getOperand(1);
+ if (!isCalleeLoad(Load, Chain))
+ continue;
+ MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+ if (!ISD::isNON_TRUNCStore(I))
+ continue;
+ SDValue Chain = I->getOperand(0);
+
+ if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
+ continue;
+
+ SDValue N1 = I->getOperand(1);
+ SDValue N2 = I->getOperand(2);
+ if ((N1.getValueType().isFloatingPoint() &&
+ !N1.getValueType().isVector()) ||
+ !N1.hasOneUse())
+ continue;
+
+ bool RModW = false;
+ SDValue Load;
+ unsigned Opcode = N1.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::VECTOR_SHUFFLE: {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+ if (!RModW)
+ RModW = isRMWLoad(N11, Chain, N2, Load);
+ break;
+ }
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ case X86ISD::SHLD:
+ case X86ISD::SHRD: {
+ SDValue N10 = N1.getOperand(0);
+ RModW = isRMWLoad(N10, Chain, N2, Load);
+ break;
+ }
+ }
+
+ if (RModW) {
+ MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
+ ++NumLoadMoved;
+ }
+ }
+}
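+
+// For example, for code like "*p += x" the store's chain initially goes
+// through a TokenFactor rather than the load, so the read-modify-write pattern
+// (store (add (load p), x), p) cannot be matched; after the transformation
+// above it can, and a single memory-operand add (e.g. "addl %ecx, (%eax)") is
+// selected instead of separate load/add/store instructions.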
+
+
+/// PreprocessForFPConvert - Walk over the dag, lowering fpround and fpextend
+/// nodes that target the FP stack into a store and load to the stack. This is a
+/// gross hack. We would like to simply mark these as being illegal, but when
+/// we do that, legalize produces these when it expands calls, then expands
+/// these in the same legalize pass. We would like dag combine to be able to
+/// hack on these between the call expansion and the node legalization. As such
+/// this pass basically does "really late" legalization of these inline with the
+/// X86 isel pass.
+void X86DAGToDAGISel::PreprocessForFPConvert() {
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ MVT SrcVT = N->getOperand(0).getValueType();
+ MVT DstVT = N->getValueType(0);
+ bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ MVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+ N->getOperand(0),
+ MemTmp, NULL, 0, MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ NULL, 0, MemVT);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havok on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
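+
+// For example, when only SSE1 is available (f32 lives in SSE registers, f64 on
+// the x87 stack), an f64->f32 FP_ROUND is rewritten here into a truncating
+// store of the f64 value to an f32 stack temporary followed by an f32 load
+// into an SSE register, instead of requiring a direct conversion node.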
+
+/// InstructionSelect - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.
+void X86DAGToDAGISel::InstructionSelect() {
+ CurBB = BB; // BB can change as result of isel.
+ const Function *F = CurDAG->getMachineFunction().getFunction();
+ OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);
+
+ DEBUG(BB->dump());
+ if (OptLevel != CodeGenOpt::None)
+ PreprocessForRMW();
+
+ // FIXME: This should only happen when not compiled with -O0.
+ PreprocessForFPConvert();
+
+ // Codegen the basic block.
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection begins:\n";
+ Indent = 0;
+#endif
+ SelectRoot(*CurDAG);
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection ends:\n";
+#endif
+
+ CurDAG->RemoveDeadNodes();
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+ MachineFrameInfo *MFI) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (Subtarget->isTargetCygMing())
+ BuildMI(BB, DebugLoc::getUnknownLoc(),
+ TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+ // If this is main, emit special code for main.
+ MachineBasicBlock *BB = MF.begin();
+ if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+ EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+
+bool X86DAGToDAGISel::MatchSegmentBaseAddress(SDValue N,
+ X86ISelAddressMode &AM) {
+ assert(N.getOpcode() == X86ISD::SegmentBaseAddress);
+ SDValue Segment = N.getOperand(0);
+
+ if (AM.Segment.getNode() == 0) {
+ AM.Segment = Segment;
+ return false;
+ }
+
+ return true;
+}
+
+bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
+ // This optimization is valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+
+ SDValue Address = N.getOperand(1);
+ if (Address.getOpcode() == X86ISD::SegmentBaseAddress &&
+ !MatchSegmentBaseAddress (Address, AM))
+ return false;
+
+ return true;
+}
+
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ bool is64Bit = Subtarget->is64Bit();
+ DOUT << "Wrapper: 64bit " << is64Bit;
+ DOUT << " AM "; DEBUG(AM.dump()); DOUT << "\n";
+
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+ if (is64Bit && (TM.getCodeModel() != CodeModel::Small))
+ return true;
+
+ // Base and index reg must be 0 in order to use rip as base.
+ bool canUsePICRel = !AM.Base.Reg.getNode() && !AM.IndexReg.getNode();
+ if (is64Bit && !canUsePICRel && TM.symbolicAddressesAreRIPRel())
+ return true;
+
+ if (AM.hasSymbolicDisplacement())
+ return true;
+  // If the value is also available in a register but both the base and index
+  // components have already been picked, we can't fit the register into the
+  // addressing mode.  Duplicate the GlobalAddress or ConstantPool as a
+  // displacement instead.
+
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ uint64_t Offset = G->getOffset();
+ if (!is64Bit || isInt32(AM.Disp + Offset)) {
+ GlobalValue *GV = G->getGlobal();
+ bool isRIPRel = TM.symbolicAddressesAreRIPRel();
+ if (N0.getOpcode() == llvm::ISD::TargetGlobalTLSAddress) {
+ TLSModel::Model model =
+ getTLSModel (GV, TM.getRelocationModel());
+ if (is64Bit && model == TLSModel::InitialExec)
+ isRIPRel = true;
+ }
+ AM.GV = GV;
+ AM.Disp += Offset;
+ AM.isRIPRel = isRIPRel;
+ return false;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ uint64_t Offset = CP->getOffset();
+ if (!is64Bit || isInt32(AM.Disp + Offset)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += Offset;
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ }
+ } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.isRIPRel = TM.symbolicAddressesAreRIPRel();
+ return false;
+ }
+
+ return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
+ unsigned Depth) {
+ bool is64Bit = Subtarget->is64Bit();
+ DebugLoc dl = N.getDebugLoc();
+ DOUT << "MatchAddress: "; DEBUG(AM.dump());
+ // Limit recursion.
+ if (Depth > 5)
+ return MatchAddressBase(N, AM);
+
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRel) {
+ if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!is64Bit || isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!is64Bit || isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ break;
+ }
+
+ case X86ISD::SegmentBaseAddress:
+ if (!MatchSegmentBaseAddress(N, AM))
+ return false;
+ break;
+
+ case X86ISD::Wrapper:
+ if (!MatchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!MatchLoad(N, AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase
+ && AM.Base.Reg.getNode() == 0) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1 || AM.isRIPRel)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (ShVal.getNode()->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+ isa<ConstantSDNode>(ShVal.getNode()->getOperand(1))) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
+ if (!is64Bit || isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ AM.IndexReg = ShVal;
+ } else {
+ AM.IndexReg = ShVal;
+ }
+ return false;
+ }
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ // FALL THROUGH
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
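+    // For example, a multiply by 9 can be selected as a single
+    // "leal (%reg,%reg,8), %dst": base = X, index = X, scale = 8.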
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0 &&
+ !AM.isRIPRel) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
+ CN->getZExtValue();
+ if (!is64Bit || isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base.Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address (leaving
+    // the index field unused), use -B as the index. This is a win if A has
+    // multiple parts that can be folded into the address. It also saves a
+    // mov if the base register has other uses, since it avoids a two-address
+    // sub instruction; however, it costs an additional mov if the index
+    // register has other uses.
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (MatchAddress(N.getNode()->getOperand(0), AM, Depth+1)) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRel) {
+ AM = Backup;
+ break;
+ }
+ int Cost = 0;
+ SDValue RHS = N.getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.getNode() &&
+ !AM.Base.Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ if (Zero.getNode()->getNodeId() == -1 ||
+ Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Zero.getNode());
+ Zero.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Neg.getNode()->getNodeId() == -1 ||
+ Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Neg.getNode());
+ Neg.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ return false;
+ }
+
+ case ISD::ADD: {
+ X86ISelAddressMode Backup = AM;
+ if (!MatchAddress(N.getNode()->getOperand(0), AM, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(1), AM, Depth+1))
+ return false;
+ AM = Backup;
+ if (!MatchAddress(N.getNode()->getOperand(1), AM, Depth+1) &&
+ !MatchAddress(N.getNode()->getOperand(0), AM, Depth+1))
+ return false;
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base.Reg.getNode() &&
+ !AM.IndexReg.getNode() &&
+ !AM.isRIPRel) {
+ AM.Base.Reg = N.getNode()->getOperand(0);
+ AM.IndexReg = N.getNode()->getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ X86ISelAddressMode Backup = AM;
+ uint64_t Offset = CN->getSExtValue();
+ // Start with the LHS as an addr mode.
+ if (!MatchAddress(N.getOperand(0), AM, Depth+1) &&
+          // MatchAddress must not have picked a GV for the displacement.
+ AM.GV == NULL &&
+ // On x86-64, the resultant disp must fit in 32-bits.
+ (!is64Bit || isInt32(AM.Disp + Offset)) &&
+ // Check to see if the LHS & C is zero.
+ CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+ AM.Disp += Offset;
+ return false;
+ }
+ AM = Backup;
+ }
+ break;
+
+ case ISD::AND: {
+    // Perform some heroic transforms on an AND of a constant-count shift
+    // with a constant to enable use of the scaled index field.
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getNumOperands() != 2) break;
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ // Not when RIP is used as the base.
+ if (AM.isRIPRel) break;
+
+ SDValue X = Shift.getOperand(0);
+ ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!C1 || !C2) break;
+
+ // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
+ // allows us to convert the shift and and into an h-register extract and
+ // a scaled index.
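+    // For example, with C1 == 5 and C2 == (0xff << 3), "(X >> 5) & 0x7f8"
+    // becomes "((X >> 8) & 0xff) << 3"; the inner value can then come from
+    // an h-register (e.g. movzbl %ah, %ecx) and be used as an index with
+    // scale 8.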
+ if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
+ unsigned ScaleLog = 8 - C1->getZExtValue();
+ if (ScaleLog > 0 && ScaleLog < 4 &&
+ C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
+ SDValue Eight = CurDAG->getConstant(8, MVT::i8);
+ SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
+ SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ X, Eight);
+ SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
+ Srl, Mask);
+ SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8);
+ SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ And, ShlCount);
+
+ // Insert the new nodes into the topological ordering.
+ if (Eight.getNode()->getNodeId() == -1 ||
+ Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Eight.getNode());
+ Eight.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Mask.getNode()->getNodeId() == -1 ||
+ Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Mask.getNode());
+ Mask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Srl.getNode()->getNodeId() == -1 ||
+ Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
+ Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (And.getNode()->getNodeId() == -1 ||
+ And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), And.getNode());
+ And.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (ShlCount.getNode()->getNodeId() == -1 ||
+ ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), ShlCount.getNode());
+ ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Shl.getNode()->getNodeId() == -1 ||
+ Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Shl.getNode());
+ Shl.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ CurDAG->ReplaceAllUsesWith(N, Shl);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+ }
+ }
+
+ // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
+ // allows us to fold the shift into this addressing mode.
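+    // For example, "(X << 2) & 0x3fc" is rewritten as "(X & 0xff) << 2",
+    // so the AND result can serve as the index register with scale 4.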
+ if (Shift.getOpcode() != ISD::SHL) break;
+
+    // Not likely to be profitable if either the AND or SHIFT node has more
+    // than one use (unless all uses are for address computation). Besides,
+    // the isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ break;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftCst = C1->getZExtValue();
+ if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
+ break;
+
+ // Get the new AND mask, this folds to a constant.
+ SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ SDValue(C2, 0), SDValue(C1, 0));
+ SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
+ NewANDMask);
+ SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ NewAND, SDValue(C1, 0));
+
+ // Insert the new nodes into the topological ordering.
+ if (C1->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), C1);
+ C1->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewANDMask.getNode()->getNodeId() == -1 ||
+ NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode());
+ NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewAND.getNode()->getNodeId() == -1 ||
+ NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode());
+ NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (NewSHIFT.getNode()->getNodeId() == -1 ||
+ NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode());
+ NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+
+ CurDAG->ReplaceAllUsesWith(N, NewSHIFT);
+
+ AM.Scale = 1 << ShiftCst;
+ AM.IndexReg = NewAND;
+ return false;
+ }
+ }
+
+ return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (AM.IndexReg.getNode() == 0 && !AM.isRIPRel) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base.Reg = N;
+ return false;
+}
+
+/// SelectAddr - returns true if it is able to pattern match an addressing
+/// mode. It returns, by reference, the operands which make up the maximal
+/// addressing mode it can match.
+bool X86DAGToDAGISel::SelectAddr(SDValue Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+ bool Done = false;
+ if (AvoidDupAddrCompute && !N.hasOneUse()) {
+ unsigned Opcode = N.getOpcode();
+ if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex &&
+ Opcode != X86ISD::Wrapper) {
+      // If we are able to fold N into the addressing mode, then we'll allow
+      // it even if N has multiple uses. In general, an address computation
+      // is used as an address by all of its uses. But watch out for
+      // CopyToReg uses: that means the address computation is live-out. It
+      // will be computed by an LEA, so we want to avoid computing the
+      // address twice.
+ for (SDNode::use_iterator UI = N.getNode()->use_begin(),
+ UE = N.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI->getOpcode() == ISD::CopyToReg) {
+ MatchAddressBase(N, AM);
+ Done = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!Done && MatchAddress(N, AM))
+ return false;
+
+ MVT VT = N.getValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base.Reg.getNode())
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
+/// match a load whose top elements are either undef or zeros. The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
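+/// For example, (v4f32 (scalar_to_vector (load p))) matches this pattern and
+/// can be emitted as a single "movss (p), %xmm0", which zeroes the upper
+/// lanes of the destination.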
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDValue Op, SDValue Pred,
+ SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment,
+ SDValue &InChain,
+ SDValue &OutChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ InChain = N.getOperand(0).getValue(1);
+ if (ISD::isNON_EXTLoad(InChain.getNode()) &&
+ InChain.getValue(0).hasOneUse() &&
+ N.hasOneUse() &&
+ IsLegalAndProfitableToFold(N.getNode(), Pred.getNode(), Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(InChain);
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ OutChain = LD->getChain();
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getNode()->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+ N.getOperand(0).getOperand(0).hasOneUse()) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ OutChain = LD->getChain();
+ InChain = SDValue(LD, 1);
+ return true;
+ }
+ return false;
+}
+
+
+/// SelectLEAAddr - calls MatchAddress and determines if the maximal
+/// addressing mode it matches can be cost-effectively emitted as an LEA
+/// instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue Op, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp) {
+ X86ISelAddressMode AM;
+
+ // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+ // segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (MatchAddress(N, AM))
+ return false;
+ assert (T == AM.Segment);
+ AM.Segment = Copy;
+
+ MVT VT = N.getValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base.Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
+  // use a simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // into an LEA. This was determined by some experimentation but is by no
+  // means optimal (especially for code size considerations). LEA is nice
+  // because of its three-address nature. Tweak the cost function again when
+  // we can run convertToThreeAddress() at register allocation time.
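+  // For example, on x86-32 "leal sym+8(%ebx,%ecx,4), %eax" folds a base, a
+  // scaled index and a symbolic displacement into one instruction, without
+  // clobbering EFLAGS.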
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, we should always use lea to materialize RIP relative
+ // addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity++;
+
+ if (Complexity > 2) {
+ SDValue Segment;
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+ }
+ return false;
+}
+
+bool X86DAGToDAGISel::TryFoldLoad(SDValue P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (ISD::isNON_EXTLoad(N.getNode()) &&
+ N.hasOneUse() &&
+ IsLegalAndProfitableToFold(N.getNode(), P.getNode(), P.getNode()))
+ return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
+ return false;
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ MachineFunction *MF = CurBB->getParent();
+ unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+ if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+ assert(Node->getOperand(0).getValueType() == MVT::Other &&
+ "Node doesn't have a token chain argument!");
+ return FindCallStartFromCall(Node->getOperand(0).getNode());
+}
+
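+/// SelectAtomic64 - Select a pseudo instruction for a 64-bit atomic
+/// read-modify-write operation on a 32-bit target. The value is passed and
+/// returned as two i32 halves; the pseudo is presumably expanded later by
+/// the custom inserter (typically into a cmpxchg8b loop).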
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = Node->getOperand(2);
+ SDValue In2H = Node->getOperand(3);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(In1, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return NULL;
+ SDValue LSI = Node->getOperand(4); // MemOperand
+ const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, LSI, Chain};
+ return CurDAG->getTargetNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other, Ops,
+ array_lengthof(Ops));
+}
+
+SDNode *X86DAGToDAGISel::Select(SDValue N) {
+ SDNode *Node = N.getNode();
+ MVT NVT = Node->getValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ DebugLoc dl = Node->getDebugLoc();
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent += 2;
+#endif
+
+ if (Node->isMachineOpcode()) {
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "== ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case X86ISD::ATOMOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMOR6432);
+ case X86ISD::ATOMXOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMXOR6432);
+ case X86ISD::ATOMADD64_DAG:
+ return SelectAtomic64(Node, X86::ATOMADD6432);
+ case X86ISD::ATOMSUB64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSUB6432);
+ case X86ISD::ATOMNAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMNAND6432);
+ case X86ISD::ATOMAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMAND6432);
+ case X86ISD::ATOMSWAP64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSWAP6432);
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ if (!isSigned)
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+ case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+ }
+ else
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
+ case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
+ case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+ case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ }
+
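+    // The one-operand MUL/IMUL forms implicitly take one operand in
+    // AL/AX/EAX/RAX and write the double-width product to AH:AL, DX:AX,
+    // EDX:EAX or RDX:RAX, hence the explicit register copies below.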
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiplication is commutative, so also try folding the other operand.
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the low half of the result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)), 0);
+ // Then truncate it down to i8.
+ SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
+ Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
+ MVT::i8, Result, SRIdx), 0);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned)
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ else
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT.getSimpleVT()) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; HiReg = X86::RDX;
+ ClrOpcode = X86::MOV64r0;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
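+    // DIV/IDIV implicitly divide AH:AL, DX:AX, EDX:EAX or RDX:RAX by the
+    // operand. CBW/CWD/CDQ/CQO sign-extend the low half into the high half
+    // for the signed case, while the MOV*r0 pseudo zeroes it for unsigned
+    // division.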
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getTargetNode(X86::MOVZX16rm8, dl, MVT::i16,
+ MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getTargetNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getTargetNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode = SDValue(CurDAG->getTargetNode(ClrOpcode, dl, NVT),
+ 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, HiReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getTargetNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the division (low) result, if it is needed.
+ if (!N.getValue(0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(N.getValue(0), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!N.getValue(1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getTargetNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ SDValue SRIdx = CurDAG->getTargetConstant(X86::SUBREG_8BIT, MVT::i32);
+ Result = SDValue(CurDAG->getTargetNode(X86::EXTRACT_SUBREG, dl,
+ MVT::i8, Result, SRIdx), 0);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(N.getValue(1), Result);
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.getNode()->dump(CurDAG));
+ DOUT << "\n";
+#endif
+ }
+
+#ifndef NDEBUG
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+
+ case ISD::DECLARE: {
+ // Handle DECLARE nodes here because the second operand may have been
+ // wrapped in X86ISD::Wrapper.
+ SDValue Chain = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N1);
+
+ // FIXME: We need to handle this for VLAs.
+ if (!FINode) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+
+ if (N2.getOpcode() == ISD::ADD &&
+ N2.getOperand(0).getOpcode() == X86ISD::GlobalBaseReg)
+ N2 = N2.getOperand(1);
+
+    // If N2 is not Wrapper(descriptor), then the llvm.declare has been
+    // mangled somehow; just ignore it.
+ if (N2.getOpcode() != X86ISD::Wrapper) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+ GlobalAddressSDNode *GVNode =
+ dyn_cast<GlobalAddressSDNode>(N2.getOperand(0));
+ if (GVNode == 0) {
+ ReplaceUses(N.getValue(0), Chain);
+ return NULL;
+ }
+ SDValue Tmp1 = CurDAG->getTargetFrameIndex(FINode->getIndex(),
+ TLI.getPointerTy());
+ SDValue Tmp2 = CurDAG->getTargetGlobalAddress(GVNode->getGlobal(),
+ TLI.getPointerTy());
+ SDValue Ops[] = { Tmp1, Tmp2, Chain };
+ return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
+ MVT::Other, Ops,
+ array_lengthof(Ops));
+ }
+ }
+
+ SDNode *ResNode = SelectCode(N);
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == N.getNode())
+ DEBUG(N.getNode()->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintCode) {
+ case 'o': // offsetable ??
+ case 'v': // not offsetable ??
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..882ee3a
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,8794 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2);
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+ : TargetLowering(TM) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+ RegInfo = TM.getRegisterInfo();
+ TD = getTargetData();
+
+ // Set up the TargetLowering object.
+
+  // X86 is weird; it always uses i8 for shift amounts and setcc results.
+ setShiftAmountType(MVT::i8);
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setSchedulingPreference(SchedulingForRegPressure);
+ setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
+ setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetMingw()) {
+    // The MS runtime is weird: it exports _setjmp, but plain longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+ addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+ addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
+ } else if (!UseSoftFloat) {
+ if (X86ScalarSSEf64) {
+ // We have an impenetrably clever algorithm for ui64->double only.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ }
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!UseSoftFloat && !NoImplicitFloat) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
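+  // For example, an f64 -> u16 conversion is performed as a signed f64 ->
+  // i32 conversion followed by a truncate, since the u16 range fits in i32.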
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ } else if (!UseSoftFloat) {
+ if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ }
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
+ setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
+ }
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
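+  // For example, on x86-32 a single idivl yields both EAX = x / y and
+  // EDX = x % y, so one selected divide covers both uses.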
+ setOperationAction(ISD::MULHS , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i8 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::SREM , MVT::i8 , Expand);
+ setOperationAction(ISD::UREM , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i16 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::SREM , MVT::i16 , Expand);
+ setOperationAction(ISD::UREM , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i32 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::SREM , MVT::i32 , Expand);
+ setOperationAction(ISD::UREM , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i64 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i64 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::SREM , MVT::i64 , Expand);
+ setOperationAction(ISD::UREM , MVT::i64 , Expand);
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ setOperationAction(ISD::SELECT , MVT::i8 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ }
+ // X86 ret instruction may pop stack.
+ setOperationAction(ISD::RET , MVT::Other, Custom);
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ if (!Subtarget->hasSSE2())
+ setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);
+
+ // Expand certain atomics
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ }
+
+ // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
+ setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() &&
+ !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing()) {
+ setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ if (Subtarget->is64Bit()) {
+ setExceptionPointerRegister(X86::RAX);
+ setExceptionSelectorRegister(X86::RDX);
+ } else {
+ setExceptionPointerRegister(X86::EAX);
+ setExceptionSelectorRegister(X86::EDX);
+ }
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ } else {
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+ if (Subtarget->isTargetCygMing())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ if (!UseSoftFloat && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (!UseSoftFloat && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ } else if (!UseSoftFloat) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // Long double always uses X87.
+ if (!UseSoftFloat) {
+ addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ bool ignored;
+ APFloat TmpFlt(+0.0);
+ TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f80 , Expand);
+ }
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+ // with -msoft-float, disable use of MMX as well.
+ if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v8i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i16, Legal);
+
+ setOperationAction(ISD::AND, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::OR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::XOR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+
+    // FIXME: Unfortunately -soft-float and -no-implicit-float mean that XMM
+    // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
+ // Do not attempt to custom lower non-power-of-2 vectors
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ continue;
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+ setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
+ AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
+ }
+
+ if (Subtarget->hasSSE41()) {
+ // FIXME: Do we need to handle scalar-to-vector here?
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+    // i8 and i16 vectors are custom, because the source register and
+    // memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ }
+ }
+
+ if (Subtarget->hasSSE42()) {
+ setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i32, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
+
+ computeRegisterProperties();
+
+  // FIXME: These should be based on subtarget info. Plus, the values should
+  // be smaller when we are optimizing for size.
+ maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+ maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
+ allowUnalignedMemoryAccesses = true; // x86 supports it!
+ setPrefLoopAlignment(16);
+ benefitFromCodePlacementOpt = true;
+}
+
+
+MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
+ return MVT::i8;
+}
+
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+ return;
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
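+/// For example (a rough sketch of the rules below): a 32-bit SSE target
+/// places an aggregate that contains a 128-bit vector member at a 16-byte
+/// boundary, an aggregate of plain i32 fields at a 4-byte boundary, and
+/// x86-64 uses at least 8-byte alignment for everything.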
+unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+/// determining it.
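+/// For example (rough sketch): with SSE2, a 16-byte-aligned stack, and a
+/// constant or string source of at least 16 bytes, this returns v4i32;
+/// otherwise it falls back to i64 on x86-64 (for sizes >= 8) and i32
+/// elsewhere.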
+MVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+ bool isSrcConst, bool isSrcStr) const {
+ // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
+ // linux. This is because the stack realignment code can't handle certain
+ // cases like PR2962. This should be removed when PR2962 is fixed.
+ if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
+ if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
+ return MVT::v4i32;
+ if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
+ return MVT::v4f32;
+ }
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
+
+/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (usesGlobalOffsetTable())
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
+ if (!Subtarget->isPICStyleRIPRel())
+ // This doesn't have DebugLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
+ getPointerTy());
+ return Table;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// LowerRET - Lower an ISD::RET node.
+SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+ SDValue Chain = Op.getOperand(0);
+
+ // Handle tail call return.
+ Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
+ if (Chain.getOpcode() == X86ISD::TAILCALL) {
+ SDValue TailCall = Chain;
+ SDValue TargetAddress = TailCall.getOperand(1);
+ SDValue StackAdjustment = TailCall.getOperand(2);
+ assert(((TargetAddress.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
+ cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+           "Expecting a global address, external symbol, or register");
+ assert(StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDValue,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
+ Operands.size());
+ }
+
+ // Regular return.
+ SDValue Flag;
+
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = Op.getOperand(i*2+1);
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+ if (VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetOps.push_back(ValToCopy);
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget->is64Bit()) {
+ MVT ValVT = ValToCopy.getValueType();
+ if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
+ ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into %rax.
+ if (Subtarget->is64Bit() &&
+ DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(X86ISD::RET_FLAG, dl,
+ MVT::Other, &RetOps[0], RetOps.size());
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. This returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+
+ DebugLoc dl = TheCall->getDebugLoc();
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool isVarArg = TheCall->isVarArg();
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+ SmallVector<SDValue, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ MVT CopyVT = VA.getValVT();
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
+ cerr << "SSE register return with SSE disabled\n";
+ exit(1);
+ }
+
+ // If this is a call to a function that returns an fp value on the floating
+ // point stack, but where we prefer to use the value in xmm registers, copy
+ // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ }
+
+ SDValue Val;
+ if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
+ // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::v2i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ Val, DAG.getConstant(0, MVT::i64));
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ InFlag = Chain.getValue(2);
+
+ if (CopyVT != VA.getValVT()) {
+      // Round the F80 to the right size, which also moves it to the
+      // appropriate xmm register.
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+ }
+
+ ResultVals.push_back(Val);
+ }
+
+ // Merge everything together with a MERGE_VALUES node.
+ ResultVals.push_back(Chain);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).getNode();
+}
+
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee cleans up the stack instead of the caller, and symbols are
+// decorated in a particular way. It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a CALL node uses struct return
+/// semantics.
+static bool CallIsStructReturn(CallSDNode *TheCall) {
+ unsigned NumOps = TheCall->getNumArgs();
+ if (!NumOps)
+ return false;
+
+ return TheCall->getArgFlags(0).isSRet();
+}
+
+/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
+/// return semantics.
+static bool ArgsAreStructReturn(SDValue Op) {
+ unsigned NumArgs = Op.getNode()->getNumValues() - 1;
+ if (!NumArgs)
+ return false;
+
+ return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
+}
+
+/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
+/// the callee to pop its own arguments. Callee pop is necessary to support tail
+/// calls.
+bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
+ if (IsVarArg)
+ return false;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::X86_FastCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::Fast:
+ return PerformTailCallOpt;
+ }
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention value.
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
+ if (Subtarget->is64Bit()) {
+ if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+ else if (CC == CallingConv::Fast && PerformTailCallOpt)
+ return CC_X86_64_TailCall;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else
+ return CC_X86_32_C;
+}
+
+/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
+/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
+NameDecorationStyle
+X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
+ unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (CC == CallingConv::X86_FastCall)
+ return FastCall;
+ else if (CC == CallingConv::X86_StdCall)
+ return StdCall;
+ return None;
+}
+
+
+/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
+/// to be in a register before calling.
+bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
+ return !IsTailCall && !Is64Bit &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT();
+}
+
+/// CallRequiresFnAddressInReg - Check whether the call requires the function
+/// address to be loaded in a register.
+bool
+X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
+ return !Is64Bit && IsTailCall &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
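+/// For example, a 24-byte byval struct with 8-byte alignment becomes a
+/// 24-byte inline memcpy at that alignment (AlwaysInline is set below, so no
+/// library call is emitted).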
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*AlwaysInline=*/true, NULL, 0, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned CC,
+ SDValue Root, unsigned i) {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags =
+ cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
+ bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+
+ // FIXME: For now, all byval parameter objects are marked mutable. This can be
+ // changed with more analysis.
+  // In case of tail call optimization, mark all arguments mutable, since they
+  // could be overwritten by the lowering of arguments in case of a tail call.
+ int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ if (Flags.isByVal())
+ return FIN;
+ return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0);
+}
+
+SDValue
+X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ // Decorate the function name.
+ FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SDValue Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+ unsigned CC = MF.getFunction()->getCallingConv();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isTargetWin64();
+
+ assert(!(isVarArg && CC == CallingConv::Fast) &&
+ "Var args not supported with calling convention fastcc");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
+
+ SmallVector<SDValue, 8> ArgValues;
+ unsigned LastVal = ~0U;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ MVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC = NULL;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = X86::GR64RegisterClass;
+ else if (RegVT == MVT::f32)
+ RC = X86::FR32RegisterClass;
+ else if (RegVT == MVT::f64)
+ RC = X86::FR64RegisterClass;
+ else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+ RC = X86::VR128RegisterClass;
+ else if (RegVT.isVector()) {
+ assert(RegVT.getSizeInBits() == 64);
+ if (!Is64Bit)
+ RC = X86::VR64RegisterClass; // MMX values are passed in MMXs.
+ else {
+ // Darwin calling convention passes MMX values in either GPRs or
+ // XMMs in x86-64. Other targets pass them in memory.
+ if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
+ RC = X86::VR128RegisterClass; // MMX values are passed in XMMs.
+ RegVT = MVT::v2i64;
+ } else {
+ RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs.
+ RegVT = MVT::i64;
+ }
+ }
+ } else {
+ assert(0 && "Unknown argument type!");
+ }
+
+ unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+ // Handle MMX values passed in GPRs.
+ if (Is64Bit && RegVT != VA.getLocVT()) {
+ if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
+ else if (RC == X86::VR128RegisterClass) {
+ ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ ArgValue, DAG.getConstant(0, MVT::i64));
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
+ }
+ }
+
+ ArgValues.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc());
+ ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
+ }
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // align stack specially for tail calls
+ if (PerformTailCallOpt && CC == CallingConv::Fast)
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes a variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ if (Is64Bit || CC != CallingConv::X86_FastCall) {
+ VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+ }
+ if (Is64Bit) {
+ unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+ // FIXME: We should really autogenerate these arrays
+ static const unsigned GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegsWin64[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
+ };
+ static const unsigned GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ const unsigned *GPR64ArgRegs, *XMMArgRegs;
+
+ if (IsWin64) {
+ TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
+ GPR64ArgRegs = GPR64ArgRegsWin64;
+ XMMArgRegs = XMMArgRegsWin64;
+ } else {
+ TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+ GPR64ArgRegs = GPR64ArgRegs64Bit;
+ XMMArgRegs = XMMArgRegs64Bit;
+ }
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+ TotalNumIntRegs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
+ TotalNumXMMRegs);
+
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+ assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1())
+ // Kernel mode asks for SSE to be disabled, so don't push them
+ // on the stack.
+ TotalNumXMMRegs = 0;
+
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so they
+      // may be loaded by dereferencing the result of va_next.
+ VarArgsGPOffset = NumIntRegs * 8;
+ VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
+ RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
+ TotalNumXMMRegs * 16, 16);
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(VarArgsGPOffset));
+ for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+ X86::GR64RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+ DAG.getIntPtrConstant(8));
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(VarArgsFPOffset));
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
+ X86::VR128RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+ DAG.getIntPtrConstant(16));
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+ }
+
+ ArgValues.push_back(Root);
+
+ // Some CCs need callee pop.
+ if (IsCalleePop(isVarArg, CC)) {
+ BytesToPopOnReturn = StackSize; // Callee pops everything.
+ BytesCallerReserves = 0;
+ } else {
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
+ BytesToPopOnReturn = 4;
+ BytesCallerReserves = StackSize;
+ }
+
+ if (!Is64Bit) {
+ RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
+ if (CC == CallingConv::X86_FastCall)
+ VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
+ }
+
+ FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr,
+ const CCValAssign &VA,
+ SDValue Chain,
+ SDValue Arg, ISD::ArgFlagsTy Flags) {
+ DebugLoc dl = TheCall->getDebugLoc();
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ if (Flags.isByVal()) {
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+ }
+ return DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), LocMemOffset);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr,
+ SDValue Chain,
+ bool IsTailCall,
+ bool Is64Bit,
+ int FPDiff,
+ DebugLoc dl) {
+ if (!IsTailCall || FPDiff==0) return Chain;
+
+ // Adjust the Return address stack slot.
+ MVT VT = getPointerTy();
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ bool Is64Bit, int FPDiff, DebugLoc dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
+ int SlotSize = Is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+ MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
+ return Chain;
+}
+
+SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+ SDValue Chain = TheCall->getChain();
+ unsigned CC = TheCall->getCallingConv();
+ bool isVarArg = TheCall->isVarArg();
+ bool IsTailCall = TheCall->isTailCall() &&
+ CC == CallingConv::Fast && PerformTailCallOpt;
+ SDValue Callee = TheCall->getCallee();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsStructRet = CallIsStructReturn(TheCall);
+ DebugLoc dl = TheCall->getDebugLoc();
+
+ assert(!(isVarArg && CC == CallingConv::Fast) &&
+ "Var args not supported with calling convention fastcc");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (PerformTailCallOpt && CC == CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (IsTailCall) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Record the delta of movement of the return address stack slot, but
+    // only if this call needs a larger move area than any earlier tail call
+    // (i.e. the new delta is more negative than the previous one).
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue RetAddrFrIdx;
+  // Load the return address for tail calls.
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
+ FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization, arguments are handled later.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ if (Is64Bit) {
+ MVT RegVT = VA.getLocVT();
+ if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+ switch (VA.getLocReg()) {
+ default:
+ break;
+ case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
+ case X86::R8: {
+ // Special case: passing MMX values in GPR registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ break;
+ }
+ case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
+ case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ break;
+ }
+ }
+ }
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ if (!IsTailCall || (IsTailCall && isByVal)) {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+ Chain, Arg, Flags));
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ // Tail call byval lowering might overwrite argument registers so in case of
+ // tail call optimization the copies to registers are lowered later.
+ if (!IsTailCall)
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // making function calls via the PLT.
+ if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+  // If we are tail calling and generating PIC/GOT style code, load the
+  // address of the callee into ecx. The value in ecx is used as the target of
+  // the tail jump. This is done to circumvent the ebx/callee-saved problem
+  // for tail calls on PIC/GOT architectures. Normally we would just put the
+  // address of GOT into ebx and then call target@PLT. But for tail calls ebx
+  // would be restored (since ebx is callee saved) before jumping to the
+  // target@PLT.
+ if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
+ // Note: The actual moving to ecx is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee,DAG);
+ }
+
+ if (Is64Bit && isVarArg) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of
+    // SSE registers used and is in the range 0 - 8 inclusive.
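+    // For example, a varargs call passing a single double in XMM0 would set
+    // %al to 1 here (any upper bound up to 8 would also be legal).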
+
+ // FIXME: Verify this on Win64
+ // Count the number of XMM registers allocated.
+ static const unsigned XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
+ DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+
+ // For tail calls lower the arguments to the 'real' stack slot.
+ if (IsTailCall) {
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+ InFlag = SDValue();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc()) {
+ assert(VA.isMemLoc());
+ SDValue Arg = TheCall->getArg(i);
+ ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(Chain, dl, Arg, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Copy arguments to their registers.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+    InFlag = SDValue();
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+ FPDiff, dl);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
+ G->getOffset());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ } else if (IsTailCall) {
+ unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
+
+ Chain = DAG.getCopyToReg(Chain, dl,
+ DAG.getRegister(Opc, getPointerTy()),
+ Callee,InFlag);
+ Callee = DAG.getRegister(Opc, getPointerTy());
+ // Add register as live out.
+ DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+
+ if (IsTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Returns a chain & a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use of the GOT pointer in EBX.
+ if (!IsTailCall && !Is64Bit &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+ // Add an implicit use of AL for x86 vararg functions.
+ if (Is64Bit && isVarArg)
+ Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (IsTailCall) {
+ assert(InFlag.getNode() &&
+ "Flag must be set. Depend on flag being set in LowerRET");
+ Chain = DAG.getNode(X86ISD::TAILCALL, dl,
+ TheCall->getVTList(), &Ops[0], Ops.size());
+
+ return SDValue(Chain.getNode(), Op.getResNo());
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush;
+ if (IsCalleePop(isVarArg, CC))
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+ else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
+    // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ NumBytesForCalleeToPush = 4;
+ else
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
+ Op.getResNo());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like StdCall, the callee cleans up the arguments, except that ECX is
+// reserved for storing the tail-called function's address. Only 2 registers
+// are free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On the X86_64 architecture with GOT-style position independent code, only
+// local (within module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
+// for example.)
+// If a tail-called callee has more arguments than the caller, the caller
+// needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after
+// the original RETADDR, but before the saved frame pointer or the spilled
+// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Align the argument stack size so that the
+/// stack stays aligned once the return address is pushed; e.g. round up to
+/// 16n + 12 for a 16 byte alignment requirement with 4 byte stack slots.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ uint64_t SlotSize = TD->getPointerSize();
+ if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+    // Misalignment is at most StackAlignment - SlotSize, so just add the
+    // difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+    // Mask out the lower bits, then add the stack alignment once plus
+    // StackAlignment - SlotSize bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ return Offset;
+}
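+// Rough worked example (16 byte alignment, 4 byte slots): StackSize = 20
+// gives 20 & 15 = 4 <= 12, so Offset becomes 20 + (12 - 4) = 28 = 16 + 12;
+// StackSize = 30 gives 30 & 15 = 14 > 12, so Offset becomes (30 & ~15) + 16 +
+// 12 = 44 = 32 + 12. Either way the delta plus the pushed return address
+// keeps the stack 16 byte aligned.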
+
+/// IsEligibleForTailCallOptimization - Check to see whether the next
+/// instruction following the call is a return. A function is eligible if
+/// caller/callee calling conventions match, currently only fastcc supports
+/// tail calls, and the function CALL is immediately followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG& DAG) const {
+ if (!PerformTailCallOpt)
+ return false;
+
+ if (CheckTailCallReturnConstraints(TheCall, Ret)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+ unsigned CalleeCC= TheCall->getCallingConv();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ SDValue Callee = TheCall->getCallee();
+ // On x86/32Bit PIC/GOT tail calls are supported.
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
+ !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
+ return true;
+
+ // Can only do local tail calls (in same module, hidden or protected) on
+ // x86_64 PIC/GOT at the moment.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return G->getGlobal()->hasHiddenVisibility()
+ || G->getGlobal()->hasProtectedVisibility();
+ }
+ }
+
+ return false;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmo,
+ DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *,
+ MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &cil
+#endif
+ ) {
+ return X86::createFastISel(mf, mmo, dw, vm, bm, am
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ uint64_t SlotSize = TD->getPointerSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_NS;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: assert(0 && "Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
+ !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default: assert(0 && "Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ }
+}
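+// For example: an integer SETLT maps directly to COND_L; SETGT against an
+// all-ones constant is rewritten as a compare with 0 using COND_NS; and a
+// floating point SETOLT swaps its operands and uses COND_A, so unordered
+// inputs fail the test.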
+
+/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
+/// code? The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified half-open range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ if (Val < 0 || Val == CmpVal)
+ return true;
+ return false;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
+/// the second operand.
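+/// For example, for v4i32 the mask <2, 3, 0, 1> qualifies, while <0, 1, 4, 5>
+/// does not (elements 4 and 5 refer to the second operand).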
+static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
+ return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return (Mask[0] < 2 && Mask[1] < 2);
+ return false;
+}
+
+bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFDMask(M, N->getValueType(0));
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
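+/// For example, <0, 1, 2, 3, 7, 6, 5, 4> qualifies; <4, 5, 6, 7, 0, 1, 2, 3>
+/// does not, because the lower quadword is not copied in order.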
+static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Lower quadword copied in order or undef.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Upper quadword shuffled.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFHWMask(M, N->getValueType(0));
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
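+/// For example, <3, 2, 1, 0, 4, 5, 6, 7> qualifies; <0, 1, 2, 3, 5, 4, 7, 6>
+/// does not, because it shuffles the upper quadword.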
+static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Upper quadword copied in order.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Lower quadword shuffled.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 4)
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFLWMask(M, N->getValueType(0));
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
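+/// For example, for v4f32 the mask <0, 3, 4, 6> qualifies (low half from V1,
+/// high half from V2), while <0, 4, 1, 5> does not.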
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+
+ return true;
+}
+
+bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isSHUFPMask(M, N->getValueType(0));
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles require the lower
+/// half elements to come from vector 1 (which would equal the dest.) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ return true;
+}
+
+static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedSHUFPMask(M, N->getValueType(0));
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+  // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
+ return isUndefOrEqual(N->getMaskElt(0), 6) &&
+ isUndefOrEqual(N->getMaskElt(1), 7) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
+ return false;
+
+ for (unsigned i = NumElems/2; i < NumElems; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ return true;
+}
+
+/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+/// and MOVLHPS.
+bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
+ return false;
+
+ return true;
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ return isUndefOrEqual(N->getMaskElt(0), 2) &&
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
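+/// For example, for v4i32 the canonical UNPCKL mask is <0, 4, 1, 5>.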
+static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
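+/// For example, for v4i32 the canonical UNPCKH mask is <2, 6, 3, 7>.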
+static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j + NumElts/2))
+ return false;
+ if (V2IsSplat) {
+      if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
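+/// For example, for v4i32 the expected mask is <4, 1, 2, 3>: element 0 comes
+/// from V2 and the remaining elements stay in place.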
+static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], NumElts))
+ return false;
+
+ for (int i = 1; i < NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isMOVLMask(M, N->getValueType(0));
+}
+
+/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
+/// of what x86 movss wants. X86 movss requires the lowest element to be the
+/// lowest element of vector 2 and the other elements to come from vector 1
+/// in order.
+static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT,
+ bool V2IsSplat = false, bool V2IsUndef = false) {
+ int NumOps = VT.getVectorNumElements();
+ if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], 0))
+ return false;
+
+ for (int i = 1; i < NumOps; ++i)
+ if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+ (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+ (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+ return false;
+
+ return true;
+}
+
+static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
+ bool V2IsUndef = false) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 1, 1, 3, 3
+ for (unsigned i = 0; i < 2; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 1)
+ return false;
+ }
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 3)
+ return false;
+ if (Elt == 3)
+ HasHi = true;
+ }
+ // Don't use movshdup if it can be done with a shufps.
+ // FIXME: verify that matching u, u, 3, 3 is what we want.
+ return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 0, 0, 2, 2
+ for (unsigned i = 0; i < 2; ++i)
+ if (N->getMaskElt(i) > 0)
+ return false;
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 2)
+ return false;
+ if (Elt == 2)
+ HasHi = true;
+ }
+ // Don't use movsldup if it can be done with a shufps.
+ return HasHi;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
+ int e = N->getValueType(0).getVectorNumElements() / 2;
+
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(e+i), i))
+ return false;
+ return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+/// instructions.
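+/// For a 4-element mask, element i lands in bits [2*i+1 : 2*i] of the
+/// immediate; e.g. the mask <3, 1, 2, 0> yields 0x27 (0b00100111).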
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ int NumOperands = SVOp->getValueType(0).getVectorNumElements();
+
+ unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ unsigned Mask = 0;
+ for (int i = 0; i < NumOperands; ++i) {
+ int Val = SVOp->getMaskElt(NumOperands-i-1);
+ if (Val < 0) Val = 0;
+ if (Val >= NumOperands) Val -= NumOperands;
+ Mask |= Val;
+ if (i != NumOperands - 1)
+ Mask <<= Shift;
+ }
+ return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUFHW
+/// instructions.
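+/// Result word 4+i is taken from source word 4 + ((Imm >> 2*i) & 3); e.g. the
+/// mask <0, 1, 2, 3, 7, 6, 5, 4> yields 0x1B (0b00011011).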
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+ // 8 nodes, but we only care about the last 4.
+ for (unsigned i = 7; i >= 4; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= (Val - 4);
+ if (i != 4)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUFLW
+/// instructions.
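+/// Result word i is taken from source word (Imm >> 2*i) & 3; e.g. the mask
+/// <3, 2, 1, 0, 4, 5, 6, 7> yields 0x1B (0b00011011).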
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+ // 8 nodes, but we only care about the first 4.
+ for (int i = 3; i >= 0; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= Val;
+ if (i != 0)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = SVOp->getMaskElt(i);
+ if (idx < 0)
+ MaskVec.push_back(idx);
+ else if (idx < (int)NumElems)
+ MaskVec.push_back(idx + NumElems);
+ else
+ MaskVec.push_back(idx - NumElems);
+ }
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) {
+ unsigned NumElems = VT.getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0)
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems;
+ else
+ Mask[i] = idx - NumElems;
+ }
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
+ if (Op->getValueType(0).getVectorNumElements() != 4)
+ return false;
+ for (unsigned i = 0, e = 2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ return false;
+ for (unsigned i = 2; i != 4; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ return false;
+ return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+ if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+ N = N->getOperand(0).getNode();
+ if (!ISD::isNON_EXTLoad(N))
+ return false;
+ if (LD)
+ *LD = cast<LoadSDNode>(N);
+ return true;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+ ShuffleVectorSDNode *Op) {
+ if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+ return false;
+  // If V2 is a vector load, don't do this transformation. We will try to fold
+  // the load into a shufps op instead.
+ if (ISD::isNON_EXTLoad(V2))
+ return false;
+
+ unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ return false;
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ return false;
+ return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ SDValue SplatValue = N->getOperand(0);
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i) != SplatValue)
+ return false;
+ return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDValue Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx >= (int)NumElems) {
+ unsigned Opc = V2.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems)))
+ return false;
+ } else if (Idx >= 0) {
+ unsigned Opc = V1.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx)))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) { // MMX
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ } else if (HasSSE2) { // SSE2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else { // SSE1
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) // MMX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ else // SSE
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
+static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ MVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ bool Changed = false;
+ SmallVector<int, 8> MaskVec;
+ SVOp->getMask(MaskVec);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (MaskVec[i] > (int)NumElems) {
+ MaskVec[i] = NumElems;
+ Changed = true;
+ }
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
+ SVOp->getOperand(1), &MaskVec[0]);
+ return SDValue(SVOp, 0);
+}
+
+/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
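+/// For example, for v4i32 this builds the mask <0, 4, 1, 5>.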
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ Mask.push_back(i);
+ Mask.push_back(i + NumElems);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
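+/// For example, for v4i32 this builds the mask <2, 6, 3, 7>.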
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned Half = NumElems/2;
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != Half; ++i) {
+ Mask.push_back(i + Half);
+ Mask.push_back(i + NumElems + Half);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4f32.
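+/// For example, for a v8i16 splat of element 5, one unpckh step leaves the
+/// splat value in 32-bit lane 1 (EltNo becomes 1), and the final <1, 1, 1, 1>
+/// v4f32 shuffle replicates that lane across the vector.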
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
+ bool HasSSE2) {
+ if (SV->getValueType(0).getVectorNumElements() <= 4)
+ return SDValue(SV, 0);
+
+ MVT PVT = MVT::v4f32;
+ MVT VT = SV->getValueType(0);
+ DebugLoc dl = SV->getDebugLoc();
+ SDValue V1 = SV->getOperand(0);
+ int NumElems = VT.getVectorNumElements();
+ int EltNo = SV->getSplatIndex();
+
+ // unpack elements to the correct location
+ while (NumElems > 4) {
+ if (EltNo < NumElems/2) {
+ V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ } else {
+ V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ EltNo -= NumElems/2;
+ }
+ NumElems >>= 1;
+ }
+
+ // Perform the splat.
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
+ V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector and a zero or undef vector. This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+ bool isZero, bool HasSSE2,
+ SelectionDAG &DAG) {
+ MVT VT = V2.getValueType();
+ SDValue V1 = isZero
+ ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(i == Idx ? NumElems : i);
+ return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getNumOfConsecutiveZeros - Return the number of consecutive elements of
+/// the shuffle result that are zero, counting from the low end if Low is
+/// true and from the high end otherwise.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
+ bool Low, SelectionDAG &DAG) {
+ unsigned NumZeros = 0;
+ for (int i = 0; i < NumElems; ++i) {
+ unsigned Index = Low ? i : NumElems-i-1;
+ int Idx = SVOp->getMaskElt(Index);
+ if (Idx < 0) {
+ ++NumZeros;
+ continue;
+ }
+ SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
+ if (Elt.getNode() && isZeroNode(Elt))
+ ++NumZeros;
+ else
+ break;
+ }
+ return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+/// FIXME: split into pslldqi, psrldqi, palignr variants.
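+/// For example, for v4i32 with V2 all zeros, the mask <4, 0, 1, 2> is a left
+/// shift by one element and <1, 2, 3, 4> is a right shift by one element.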
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ int NumElems = SVOp->getValueType(0).getVectorNumElements();
+
+ isLeft = true;
+ unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
+ if (!NumZeros) {
+ isLeft = false;
+ NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
+ if (!NumZeros)
+ return false;
+ }
+ bool SeenV1 = false;
+ bool SeenV2 = false;
+ for (int i = NumZeros; i < NumElems; ++i) {
+ int Val = isLeft ? (i - NumZeros) : i;
+ int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
+ if (Idx < 0)
+ continue;
+ if (Idx < NumElems)
+ SeenV1 = true;
+ else {
+ Idx -= NumElems;
+ SeenV2 = true;
+ }
+ if (Idx != Val)
+ return false;
+ }
+ if (SeenV1 && SeenV2)
+ return false;
+
+ ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
+ ShAmt = NumZeros;
+ return true;
+}
+
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
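+/// Adjacent byte pairs are zero-extended to i16, merged with a shl/or, and
+/// inserted into a v8i16 vector, which is finally bitcast back to v16i8.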
+///
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+
+ if ((i & 1) != 0) {
+ SDValue ThisElt(0, 0), LastElt(0, 0);
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+ ThisElt, DAG.getConstant(8, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.getNode())
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i/2));
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v8i16, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
+ unsigned NumBits, SelectionDAG &DAG,
+ const TargetLowering &TLI, DebugLoc dl) {
+ bool isMMX = VT.getSizeInBits() == 64;
+ MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+ unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+ SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(Opc, dl, ShVT, SrcOp,
+ DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
+ if (ISD::isBuildVectorAllZeros(Op.getNode())
+ || ISD::isBuildVectorAllOnes(Op.getNode())) {
+ // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+ // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+ // eliminated on x86-32 hosts.
+ if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+ return Op;
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ return getOnesVector(Op.getValueType(), DAG, dl);
+ return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ }
+
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ unsigned NumElems = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (isZeroNode(Elt))
+ NumZero++;
+ else {
+ NonZeros |= (1 << i);
+ NumNonZero++;
+ }
+ }
+
+ if (NumNonZero == 0) {
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ return DAG.getUNDEF(VT);
+ }
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1 && NumElems <= 4) {
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle MMX and SSE both.
+ MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+ unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+ Subtarget->hasSSE2(), DAG);
+
+ // Now we have our 32-bit value zero extended in the low element of
+ // a vector. If Idx != 0, swizzle it into place.
+ if (Idx != 0) {
+ SmallVector<int, 4> Mask;
+ Mask.push_back(Idx);
+ for (unsigned i = 1; i != VecElts; ++i)
+ Mask.push_back(i);
+ Item = DAG.getVectorShuffle(VecVT, dl, Item,
+ DAG.getUNDEF(Item.getValueType()),
+ &Mask[0]);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is. Because we can only get here
+ // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
+ if (Idx == 0 &&
+ // Don't do this for i64 values on x86-32.
+ (EVT != MVT::i64 || Subtarget->is64Bit())) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+ Subtarget->hasSSE2(), DAG);
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ // Turn it into a shuffle of zero and zero-extended scalar to vector.
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+ Subtarget->hasSSE2(), DAG);
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i++)
+ MaskVec.push_back(i == Idx ? 0 : 1);
+ return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1)
+ return SDValue();
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true,
+ Subtarget->hasSSE2(), DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16) {
+ SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ if (EVTBits == 16 && NumElems == 8) {
+ SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ SmallVector<SDValue, 8> V;
+ V.resize(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1 << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ break;
+ case 2:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ case 3:
+ V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ }
+ }
+
+ SmallVector<int, 8> MaskVec;
+ bool Reverse = (NonZeros & 0x3) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i : i);
+ Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
+ return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ }
+
+ if (Values.size() > 2) {
+    // If we have SSE4.1, expand into a number of inserts unless the number of
+ // values to be inserted is equal to the number of elements, in which case
+ // use the unpack code below in the hopes of matching the consecutive elts
+ // load merge pattern for shuffles.
+ // FIXME: We could probably just check that here directly.
+ if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
+ getSubtarget()->hasSSE41()) {
+ V[0] = DAG.getUNDEF(VT);
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
+ Op.getOperand(i), DAG.getIntPtrConstant(i));
+ return V[0];
+ }
+ // Expand into a number of unpckl*.
+ // e.g. for v4f32
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ NumElems >>= 1;
+ while (NumElems != 0) {
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
+ NumElems >>= 1;
+ }
+ return V[0];
+ }
+
+ return SDValue();
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all] pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static
+SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG, X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 8> MaskVals;
+
+ // Determine if more than 1 of the words in each of the low and high quadwords
+ // of the result come from the same quadword of one of the two inputs. Undef
+ // mask values count as coming from any quadword, for better codegen.
+ SmallVector<unsigned, 4> LoQuad(4);
+ SmallVector<unsigned, 4> HiQuad(4);
+ BitVector InputQuads(4);
+ for (unsigned i = 0; i < 8; ++i) {
+ SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+ int EltIdx = SVOp->getMaskElt(i);
+ MaskVals.push_back(EltIdx);
+ if (EltIdx < 0) {
+ ++Quad[0];
+ ++Quad[1];
+ ++Quad[2];
+ ++Quad[3];
+ continue;
+ }
+ ++Quad[EltIdx / 4];
+ InputQuads.set(EltIdx / 4);
+ }
+
+ int BestLoQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LoQuad[i] > MaxQuad) {
+ BestLoQuad = i;
+ MaxQuad = LoQuad[i];
+ }
+ }
+
+ int BestHiQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HiQuad[i] > MaxQuad) {
+ BestHiQuad = i;
+ MaxQuad = HiQuad[i];
+ }
+ }
+
+  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If there are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+ bool V1Used = InputQuads[0] || InputQuads[1];
+ bool V2Used = InputQuads[2] || InputQuads[3];
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ if (InputQuads.count() == 2 && V1Used && V2Used) {
+ BestLoQuad = InputQuads.find_first();
+ BestHiQuad = InputQuads.find_next(BestLoQuad);
+ }
+ if (InputQuads.count() > 2) {
+ BestLoQuad = -1;
+ BestHiQuad = -1;
+ }
+ }
+
+ // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+ // the shuffle mask. If a quad is scored as -1, that means that it contains
+ // words from all 4 input quadwords.
+ SDValue NewV;
+ if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
+ MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
+ NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
+ NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
+
+ // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+ // source words for the shuffle, to aid later transformations.
+ bool AllWordsInNewV = true;
+ bool InOrder[2] = { true, true };
+ for (unsigned i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx != (int)i)
+ InOrder[i/4] = false;
+ if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+ continue;
+ AllWordsInNewV = false;
+ break;
+ }
+
+ bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+ if (AllWordsInNewV) {
+ for (int i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0)
+ continue;
+ idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+ if ((idx != i) && idx < 4)
+ pshufhw = false;
+ if ((idx != i) && idx > 3)
+ pshuflw = false;
+ }
+ V1 = NewV;
+ V2Used = false;
+ BestLoQuad = 0;
+ BestHiQuad = 1;
+ }
+
+ // If we've eliminated the use of V2, and the new mask is a pshuflw or
+ // pshufhw, that's as cheap as it gets. Return the new shuffle.
+ if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+ return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+ DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+ }
+ }
+
+ // If we have SSSE3, and all words of the result are from 1 input vector,
+ // case 2 is generated, otherwise case 3 is generated. If no SSSE3
+ // is present, fall back to case 4.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If we have elements from both input vectors, set the high bit of the
+ // shuffle mask element to zero out elements that come from V2 in the V1
+ // mask, and elements that come from V1 in the V2 mask, so that the two
+ // results can be OR'd together.
+ bool TwoInputs = V1Used && V2Used;
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (TwoInputs && (EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ }
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ }
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ }
+
+ // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+ // and update MaskVals with new element order.
+ BitVector InOrder(8);
+ if (BestLoQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (int i = 0; i != 4; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestLoQuad) {
+ MaskV.push_back(idx & 3);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ for (unsigned i = 4; i != 8; ++i)
+ MaskV.push_back(i);
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
+ // and update MaskVals with the new element order.
+ if (BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (unsigned i = 0; i != 4; ++i)
+ MaskV.push_back(i);
+ for (unsigned i = 4; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestHiQuad) {
+ MaskV.push_back((idx & 3) + 4);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
+  // has a word from each of the four input quadwords, calculate the InOrder
+  // bitvector now before falling through to the insert/extract cleanup.
+ if (BestLoQuad == -1 && BestHiQuad == -1) {
+ NewV = V1;
+ for (int i = 0; i != 8; ++i)
+ if (MaskVals[i] < 0 || MaskVals[i] == i)
+ InOrder.set(i);
+ }
+
+ // The other elements are put in the right place using pextrw and pinsrw.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (InOrder[i])
+ continue;
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ SDValue ExtOp = (EltIdx < 8)
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx))
+ : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ DAG.getIntPtrConstant(EltIdx - 8));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+ DAG.getIntPtrConstant(i));
+ }
+ return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG, X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 16> MaskVals;
+ SVOp->getMask(MaskVals);
+
+ // If we have SSSE3, case 1 is generated when all result bytes come from
+ // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
+ // present, fall back to case 3.
+  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
+ bool V1Only = true;
+ bool V2Only = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ if (EltIdx < 16)
+ V2Only = false;
+ else
+ V1Only = false;
+ }
+
+ // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If all result elements are from one input vector, then only translate
+ // undef mask values to 0x80 (zero out result) in the pshufb mask.
+ //
+ // Otherwise, we have elements from both input vectors, and must zero out
+ // elements that come from V2 in the first mask, and V1 in the second mask
+ // so that we can OR them together.
+ bool TwoInputs = !(V1Only || V2Only);
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ // If all the elements are from V2, assign it to V1 and return after
+ // building the first pshufb.
+ if (V2Only)
+ V1 = V2;
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return V1;
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ }
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ }
+
+  // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
+  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
+ // the 16 different words that comprise the two doublequadword input vectors.
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+ SDValue NewV = V2Only ? V2 : V1;
+ for (int i = 0; i != 8; ++i) {
+ int Elt0 = MaskVals[i*2];
+ int Elt1 = MaskVals[i*2+1];
+
+ // This word of the result is all undef, skip it.
+ if (Elt0 < 0 && Elt1 < 0)
+ continue;
+
+ // This word of the result is already in the correct place, skip it.
+ if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
+ continue;
+ if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ continue;
+
+ SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+ SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+ SDValue InsElt;
+
+    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
+    // together using a single extract, extract the word and insert it.
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ continue;
+ }
+
+ // If Elt1 is defined, extract it from the appropriate source. If the
+    // source byte is not also odd, shift the extracted word left 8 bits;
+    // otherwise clear the bottom 8 bits if we need to do an or.
+ if (Elt1 >= 0) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ if ((Elt1 & 1) == 0)
+ InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt0 >= 0)
+ InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+ DAG.getConstant(0xFF00, MVT::i16));
+ }
+ // If Elt0 is defined, extract it from the appropriate source. If the
+ // source byte is not also even, shift the extracted word right 8 bits. If
+ // Elt1 was also defined, OR the extracted values together before
+ // inserting them in the result.
+ if (Elt0 >= 0) {
+ SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+ if ((Elt0 & 1) != 0)
+ InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt1 >= 0)
+ InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+ DAG.getConstant(0x00FF, MVT::i16));
+ InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+ : InsElt0;
+ }
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4f32 / v4i32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g. the v8i16 shuffle
+/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
+/// can be rewritten as a v4i32 shuffle with mask < 1, 5, 0, 7 >.
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ TargetLowering &TLI, DebugLoc dl) {
+ MVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NewWidth = (NumElems == 4) ? 2 : 4;
+ MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+ MVT MaskEltVT = MaskVT.getVectorElementType();
+ MVT NewVT = MaskVT;
+ switch (VT.getSimpleVT()) {
+ default: assert(false && "Unexpected!");
+ case MVT::v4f32: NewVT = MVT::v2f64; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; break;
+ }
+
+ if (NewWidth == 2) {
+ if (VT.isInteger())
+ NewVT = MVT::v2i64;
+ else
+ NewVT = MVT::v2f64;
+ }
+ int Scale = NumElems / NewWidth;
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i += Scale) {
+ int StartIdx = -1;
+ for (int j = 0; j < Scale; ++j) {
+ int EltIdx = SVOp->getMaskElt(i+j);
+ if (EltIdx < 0)
+ continue;
+ if (StartIdx == -1)
+ StartIdx = EltIdx - (EltIdx % Scale);
+ if (EltIdx != StartIdx + j)
+ return SDValue();
+ }
+ if (StartIdx == -1)
+ MaskVec.push_back(-1);
+ else
+ MaskVec.push_back(StartIdx / Scale);
+ }
+
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
+ return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
+ SDValue SrcOp, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, DebugLoc dl) {
+ if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ LoadSDNode *LD = NULL;
+ if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+ LD = dyn_cast<LoadSDNode>(SrcOp);
+ if (!LD) {
+ // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+ // instead.
+ MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+ if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
+ SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+ SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
+ // PR2108
+ OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ OpVT,
+ SrcOp.getOperand(0)
+ .getOperand(0))));
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
+/// shuffles.
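+/// If at most two elements come from each input, two shuffles suffice. If
+/// three elements come from one input, two shufps-style shuffles through an
+/// intermediate vector are used. Otherwise the shuffle is broken into
+/// (shuffle shuffle_hi, shuffle_lo).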
+static SDValue
+LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT VT = SVOp->getValueType(0);
+
+ SmallVector<std::pair<int, int>, 8> Locs;
+ Locs.resize(4);
+ SmallVector<int, 8> Mask1(4U, -1);
+ SmallVector<int, 8> PermMask;
+ SVOp->getMask(PermMask);
+
+ unsigned NumHi = 0;
+ unsigned NumLo = 0;
+ for (unsigned i = 0; i != 4; ++i) {
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else {
+ assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+ if (Idx < 4) {
+ Locs[i] = std::make_pair(0, NumLo);
+ Mask1[NumLo] = Idx;
+ NumLo++;
+ } else {
+ Locs[i] = std::make_pair(1, NumHi);
+ if (2+NumHi < 4)
+ Mask1[2+NumHi] = Idx;
+ NumHi++;
+ }
+ }
+ }
+
+ if (NumLo <= 2 && NumHi <= 2) {
+    // No more than two elements come from either vector. This can be
+    // implemented with two shuffles. The first shuffle gathers the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, puts the elements into the right order.
+ V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ SmallVector<int, 8> Mask2(4U, -1);
+
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1)
+ continue;
+ else {
+ unsigned Idx = (i < 2) ? 0 : 4;
+ Idx += Locs[i].first * 2 + Locs[i].second;
+ Mask2[i] = Idx;
+ }
+ }
+
+ return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+ } else if (NumLo == 3 || NumHi == 3) {
+ // Otherwise, we must have three elements from one vector, call it X, and
+ // one element from the other, call it Y. First, use a shufps to build an
+ // intermediate vector with the one element from Y and the element from X
+ // that will be in the same half in the final destination (the indexes don't
+ // matter). Then, use a shufps to build the final vector, taking the half
+ // containing the element from Y from the intermediate, and the other half
+ // from X.
+ if (NumHi == 3) {
+ // Normalize it so the 3 elements come from V1.
+ CommuteVectorShuffleMask(PermMask, VT);
+ std::swap(V1, V2);
+ }
+
+ // Find the element from V2.
+ unsigned HiIndex;
+ for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+ int Val = PermMask[HiIndex];
+ if (Val < 0)
+ continue;
+ if (Val >= 4)
+ break;
+ }
+
+ Mask1[0] = PermMask[HiIndex];
+ Mask1[1] = -1;
+ Mask1[2] = PermMask[HiIndex^1];
+ Mask1[3] = -1;
+ V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ if (HiIndex >= 2) {
+ Mask1[0] = PermMask[0];
+ Mask1[1] = PermMask[1];
+ Mask1[2] = HiIndex & 1 ? 6 : 4;
+ Mask1[3] = HiIndex & 1 ? 4 : 6;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+ } else {
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+ }
+ }
+
+ // Break it into (shuffle shuffle_hi, shuffle_lo).
+ Locs.clear();
+ SmallVector<int,8> LoMask(4U, -1);
+ SmallVector<int,8> HiMask(4U, -1);
+
+ SmallVector<int,8> *MaskPtr = &LoMask;
+ unsigned MaskIdx = 0;
+ unsigned LoIdx = 0;
+ unsigned HiIdx = 2;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (i == 2) {
+ MaskPtr = &HiMask;
+ MaskIdx = 1;
+ LoIdx = 0;
+ HiIdx = 2;
+ }
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else if (Idx < 4) {
+ Locs[i] = std::make_pair(MaskIdx, LoIdx);
+ (*MaskPtr)[LoIdx] = Idx;
+ LoIdx++;
+ } else {
+ Locs[i] = std::make_pair(MaskIdx, HiIdx);
+ (*MaskPtr)[HiIdx] = Idx;
+ HiIdx++;
+ }
+ }
+
+ SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+ SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+ SmallVector<int, 8> MaskOps;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1) {
+ MaskOps.push_back(-1);
+ } else {
+ unsigned Idx = Locs[i].first * 4 + Locs[i].second;
+ MaskOps.push_back(Idx);
+ }
+ }
+ return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned NumElems = VT.getVectorNumElements();
+ bool isMMX = VT.getSizeInBits() == 64;
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsSplat = false;
+ bool V2IsSplat = false;
+
+ if (isZeroShuffle(SVOp))
+ return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+
+ // Promote splats to v4f32.
+ if (SVOp->isSplat()) {
+ if (isMMX || NumElems < 4)
+ return Op;
+ return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
+ }
+
+ // If the shuffle can be profitably rewritten as a narrower shuffle, then
+ // do it!
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode())
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ LowerVECTOR_SHUFFLE(NewOp, DAG));
+ } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ // FIXME: Figure out a cleaner way to do this.
+ // Try to make use of movq to zero out the top part.
+ if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode()) {
+ if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
+ DAG, Subtarget, dl);
+ }
+ } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+ DAG, Subtarget, dl);
+ }
+ }
+
+ if (X86::isPSHUFDMask(SVOp))
+ return Op;
+
+ // Check if this can be converted into a logical shift.
+ bool isLeft = false;
+ unsigned ShAmt = 0;
+ SDValue ShVal;
+ bool isShift = getSubtarget()->hasSSE2() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ if (isShift && ShVal.hasOneUse()) {
+ // If the shifted value has multiple uses, it may be cheaper to use
+ // v_set0 + movlhps or movhlps, etc.
+ MVT EVT = VT.getVectorElementType();
+ ShAmt *= EVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ if (X86::isMOVLMask(SVOp)) {
+ if (V1IsUndef)
+ return V2;
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+ if (!isMMX)
+ return Op;
+ }
+
+ // FIXME: fold these into legal mask.
+ if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
+ X86::isMOVSLDUPMask(SVOp) ||
+ X86::isMOVHLPSMask(SVOp) ||
+ X86::isMOVHPMask(SVOp) ||
+ X86::isMOVLPMask(SVOp)))
+ return Op;
+
+ if (ShouldXformToMOVHLPS(SVOp) ||
+ ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ if (isShift) {
+ // No better options. Use a vshl / vsrl.
+ MVT EVT = VT.getVectorElementType();
+ ShAmt *= EVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ bool Commuted = false;
+ // FIXME: This should also accept a bitcast of a splat? Be careful, not
+ // 1,1,1,1 -> v8i16 though.
+ V1IsSplat = isSplatVector(V1.getNode());
+ V2IsSplat = isSplatVector(V2.getNode());
+
+ // Canonicalize the splat or undef, if present, to be on the RHS.
+ if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+ Op = CommuteVectorShuffle(SVOp, DAG);
+ SVOp = cast<ShuffleVectorSDNode>(Op);
+ V1 = SVOp->getOperand(0);
+ V2 = SVOp->getOperand(1);
+ std::swap(V1IsSplat, V2IsSplat);
+ std::swap(V1IsUndef, V2IsUndef);
+ Commuted = true;
+ }
+
+ if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+ // Shuffling low element of v1 into undef, just return v1.
+ if (V2IsUndef)
+ return V1;
+ // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+ // the instruction selector will not match, so get a canonical MOVL with
+ // swapped operands to undo the commute.
+ return getMOVL(DAG, dl, VT, V2, V1);
+ }
+
+ if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
+ X86::isUNPCKH_v_undef_Mask(SVOp) ||
+ X86::isUNPCKLMask(SVOp) ||
+ X86::isUNPCKHMask(SVOp))
+ return Op;
+
+ if (V2IsSplat) {
+    // Normalize the mask so all entries that point to V2 point to its first
+    // element, then try to match unpck{h|l} again. If they match, return a
+    // new vector_shuffle with the corrected mask.
+ SDValue NewMask = NormalizeMask(SVOp, DAG);
+ ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
+ if (NSVOp != SVOp) {
+ if (X86::isUNPCKLMask(NSVOp, true)) {
+ return NewMask;
+ } else if (X86::isUNPCKHMask(NSVOp, true)) {
+ return NewMask;
+ }
+ }
+ }
+
+ if (Commuted) {
+    // Commute it back and try unpck* again.
+ // FIXME: this seems wrong.
+ SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
+ ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+ if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKLMask(NewSVOp) ||
+ X86::isUNPCKHMask(NewSVOp))
+ return NewOp;
+ }
+
+ // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
+
+ // Normalize the node to match x86 shuffle ops if needed
+ if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // Check for legal shuffle and return?
+ SmallVector<int, 16> PermMask;
+ SVOp->getMask(PermMask);
+ if (isShuffleMaskLegal(PermMask, VT))
+ return Op;
+
+ // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+ if (VT == MVT::v8i16) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ if (VT == MVT::v16i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ // Handle all 4 wide cases with a number of shuffles except for MMX.
+ if (NumElems == 4 && !isMMX)
+ return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 16) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1)));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+ // the result back to FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+ (User->getOpcode() != ISD::BIT_CONVERT ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
+ } else if (VT == MVT::i32) {
+ // ExtractPS works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ return SDValue();
+}
+
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ if (Subtarget->hasSSE41()) {
+ SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ // TODO: handle v16i8.
+ if (VT.getSizeInBits() == 16) {
+ SDValue Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32, Vec),
+ Op.getOperand(1)));
+    // Transform it so it matches pextrw, which produces a 32-bit result.
+ MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 32) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { Idx, -1, -1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ } else if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ MVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ }
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
+ isa<ConstantSDNode>(N2)) {
+ unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
+ : X86ISD::PINSRW;
+    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
+    // second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into these
+ // bits. For example (insert (extract, 3), 2) could be matched by putting
+ // the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
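+    // As an illustration of the encoding (not literal output of this path):
+    // "insertps $0x10, %xmm1, %xmm0" has count_s = 0 (bits [7:6]),
+    // count_d = 1 (bits [5:4]) and a zero mask of 0, i.e. it copies element 0
+    // of %xmm1 into element 1 of %xmm0 and leaves the other elements alone.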
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ } else if (EVT == MVT::i32) {
+ // InsertPS works with constant index.
+ if (isa<ConstantSDNode>(N2))
+ return Op;
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT EVT = VT.getVectorElementType();
+
+ if (Subtarget->hasSSE41())
+ return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+ if (EVT == MVT::i8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if (EVT.getSizeInBits() == 16) {
+    // Transform it so it matches pinsrw, which expects a 16-bit value in a
+    // GR32 as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType() == MVT::v2f32)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
+ Op.getOperand(0))));
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ MVT VT = MVT::v2i32;
+ switch (Op.getValueType().getSimpleVT()) {
+ default: break;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ VT = MVT::v4i32;
+ break;
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) would return N. So the raw TargetGlobalAddress nodes, etc. can
+// only be used to form addressing modes. These wrapped nodes will be
+// selected into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ // FIXME there isn't really any debug info here, should come from the parent
+ DebugLoc dl = CP->getDebugLoc();
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset,
+ SelectionDAG &DAG) const {
+ bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+ bool ExtraLoadRequired =
+ Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);
+
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ SDValue Result;
+ if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ Offset = 0;
+ } else
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (IsPic && !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+  // For Darwin & Mingw32, external and weak symbols are indirect, so we want
+  // to load the value at address GV, not the value of GV itself. This means
+  // that the GlobalAddress must be in the base or index register of the
+  // address, not in the GV offset field. The platform check is inside the
+  // GVRequiresExtraLoad() call. The same applies to external symbols during
+  // PIC codegen.
+ if (ExtraLoadRequired)
+ Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ PseudoSourceValue::getGOT(), 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+ DAG.getConstant(Offset, getPointerTy()));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) {
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ }
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
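+//
+// The emitted sequence is roughly (ELF/x86-32, shown as an illustration):
+//   leal  x@TLSGD(,%ebx,1), %eax
+//   call  ___tls_get_addr@PLT
+// where %ebx holds the GOT base set up below and the result is returned
+// in %eax.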
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT) {
+ SDValue InFlag;
+ DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT PtrVT, TLSModel::Model model,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+ // Get the Thread Pointer
+ SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
+ DebugLoc::getUnknownLoc(), PtrVT,
+ DAG.getRegister(is64Bit? X86::FS : X86::GS,
+ MVT::i32));
+
+ SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
+ NULL, 0);
+
+ // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+ // exec)
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec)
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ PseudoSourceValue::getGOT(), 0);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
+ // TODO: implement the "local dynamic" model
+ // TODO: implement the "initial exec"model for pic executables
+ assert(Subtarget->isTargetELF() &&
+ "TLS not implemented for non-ELF targets");
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ GlobalValue *GV = GA->getGlobal();
+ TLSModel::Model model =
+ getTLSModel (GV, getTargetMachine().getRelocationModel());
+ if (Subtarget->is64Bit()) {
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic: // not implemented
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true);
+ }
+ } else {
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic: // not implemented
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false);
+ }
+ }
+ assert(0 && "Unreachable");
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
+ // FIXME there isn't really any debug info here
+ DebugLoc dl = Op.getDebugLoc();
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  // FIXME there isn't really any debug info here
+ DebugLoc dl = JT->getDebugLoc();
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc::getUnknownLoc(),
+ getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// take a 2 x i32 value to shift plus a shift amount.
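+///
+/// A rough sketch of the SHL_PARTS case for i32 parts (this assumes x86 shift
+/// semantics, i.e. the hardware masks the count to 5 bits; the code below
+/// realizes the final select with a CMOV on bit 5 of the shift amount):
+///
+///   uint32_t shld(uint32_t hi, uint32_t lo, unsigned amt) { // SHLD semantics
+///     amt &= 31;
+///     return amt ? (hi << amt) | (lo >> (32 - amt)) : hi;
+///   }
+///   void shl64(uint32_t lo, uint32_t hi, unsigned amt,
+///              uint32_t &OutLo, uint32_t &OutHi) {
+///     uint32_t T2 = shld(hi, lo, amt);     // candidate high word
+///     uint32_t T3 = lo << (amt & 31);      // candidate low word
+///     if (amt & 32) { OutHi = T3; OutLo = 0;  }   // amount is 32..63
+///     else          { OutHi = T2; OutLo = T3; }
+///   }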
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ MVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue Tmp1 = isSRA ?
+ DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i8)) :
+ DAG.getConstant(0, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ }
+
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, MVT::i8));
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
+ AndNode, DAG.getConstant(0, MVT::i8));
+
+ SDValue Hi, Lo;
+ SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ } else {
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ }
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getValueType();
+ assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ return Op;
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ Subtarget->is64Bit()) {
+ return Op;
+ }
+
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Size = SrcVT.getSizeInBits()/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) {
+ // Build the FILD
+ DebugLoc dl = Op.getDebugLoc();
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(SrcVT));
+ SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
+ Tys, &Ops[0], Ops.size());
+
+ if (useSSE) {
+ Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ Tys = DAG.getVTList(MVT::Other);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Result);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(Op.getValueType()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
+ Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ }
+
+ return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
+ // This algorithm is not obvious. Here it is in C code, more or less:
+ /*
+ double uint64_to_double( uint32_t hi, uint32_t lo ) {
+ static const __m128i exp = { 0x4330000045300000ULL, 0 };
+ static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
+
+ // Copy ints to xmm registers.
+ __m128i xh = _mm_cvtsi32_si128( hi );
+ __m128i xl = _mm_cvtsi32_si128( lo );
+
+ // Combine into low half of a single xmm register.
+ __m128i x = _mm_unpacklo_epi32( xh, xl );
+ __m128d d;
+ double sd;
+
+ // Merge in appropriate exponents to give the integer bits the right
+ // magnitude.
+ x = _mm_unpacklo_epi32( x, exp );
+
+ // Subtract away the biases to deal with the IEEE-754 double precision
+ // implicit 1.
+ d = _mm_sub_pd( (__m128d) x, bias );
+
+ // All conversions up to here are exact. The correctly rounded result is
+ // calculated using the current rounding mode using the following
+ // horizontal add.
+ d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
+ _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
+ // store doesn't really need to be here (except
+ // maybe to zero the other double)
+ return sd;
+ }
+ */
+
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Build some magic constants.
+ std::vector<Constant*> CV0;
+ CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0)));
+ CV0.push_back(ConstantInt::get(APInt(32, 0)));
+ Constant *C0 = ConstantVector::get(CV0);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+ std::vector<Constant*> CV1;
+ CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
+ CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(1)));
+ SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+ SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+ SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
+ SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
+ SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+
+ // Add the halves; easiest way is to swap them into another reg first.
+ int ShufMask[2] = { 1, -1 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
+ DAG.getUNDEF(MVT::v2f64), ShufMask);
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+ DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
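+//
+// The idea, in C more or less (a sketch of the trick, not the exact code):
+/*
+   double uint32_to_double(uint32_t x) {
+     // 2^52 has bit pattern 0x4330000000000000; or-ing x into the low
+     // mantissa bits gives the exact double value 2^52 + x.
+     uint64_t bits = 0x4330000000000000ULL | x;
+     double d;
+     memcpy(&d, &bits, sizeof d);       // assumes <string.h> and <stdint.h>
+     return d - 4503599627370496.0;     // subtract 2^52 to recover x exactly
+   }
+*/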
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+
+ Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0));
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Load)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Bias)));
+ Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
+ DAG.getIntPtrConstant(0));
+
+ // Subtract the bias.
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ MVT DestVT = Op.getValueType();
+
+ if (DestVT.bitsLT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0));
+ } else if (DestVT.bitsGT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+ }
+
+ // Handle final rounding.
+ return Sub;
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+  // Now that UINT_TO_FP is legal (it's marked Custom), the dag combiner
+  // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
+  // perform the optimization here.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+ MVT SrcVT = N0.getValueType();
+ if (SrcVT == MVT::i64) {
+ // We only handle SSE2 f64 target here; caller can expand the rest.
+ if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
+ return SDValue();
+
+ return LowerUINT_TO_FP_i64(Op, DAG);
+ } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
+ return LowerUINT_TO_FP_i32(Op, DAG);
+ }
+
+ assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
+
+ // Make a 64-bit buffer, and use it to build an FILD.
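+  // Storing the 32-bit value in the low word and zero in the high word makes
+  // the buffer a non-negative i64 (x86 is little-endian), so a signed FILD
+  // of it produces the correct unsigned value.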
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ SDValue WordOff = DAG.getConstant(4, getPointerTy());
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+ getPointerTy(), StackSlot, WordOff);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, NULL, 0);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+ OffsetSlot, NULL, 0);
+ return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+}
+
+std::pair<SDValue,SDValue> X86TargetLowering::
+FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ MVT DstTy = Op.getValueType();
+
+ if (!IsSigned) {
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_SINT to lower!");
+
+ // These are really Legal.
+ if (DstTy == MVT::i32 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+ if (Subtarget->is64Bit() &&
+ DstTy == MVT::i64 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+
+ // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+ // stack slot.
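+  // Roughly: the value is placed on the x87 stack (via a store and FLD when
+  // it currently lives in an SSE register), then the *_IN_MEM pseudo stores
+  // the truncated integer to the slot; it is a pseudo because fistp honors
+  // the current rounding mode, so the expansion temporarily switches the FPU
+  // control word to round-toward-zero. The caller reloads the result.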
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ unsigned Opc;
+ switch (DstTy.getSimpleVT()) {
+ default: assert(0 && "Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Value = Op.getOperand(0);
+ if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, dl, Value, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+ };
+ Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
+
+ return std::make_pair(FIST, StackSlot);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (FIST.getNode() == 0) return Op;
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ assert(FIST.getNode() && "Unexpected failure");
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0);
+}
+
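+// LowerFABS - Lower FABS by clearing the sign bit: AND the value with a
+// constant-pool mask of ~(1 << 63) per f64 element (~(1 << 31) for f32).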
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT EltVT = VT;
+ if (VT.isVector())
+ EltVT = VT.getVectorElementType();
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31))));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
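+// LowerFNEG - Lower FNEG by flipping the sign bit: XOR the value with a
+// constant-pool mask of (1 << 63) per f64 element ((1 << 31) for f32).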
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT EltVT = VT;
+ unsigned EltNum = 1;
+ if (VT.isVector()) {
+ EltVT = VT.getVectorElementType();
+ EltNum = VT.getVectorNumElements();
+ }
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63)));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31)));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ if (VT.isVector()) {
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::XOR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
+ } else {
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ }
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ MVT VT = Op.getValueType();
+ MVT SrcVT = Op1.getValueType();
+
+ // If second operand is smaller, extend it first.
+ if (SrcVT.bitsLT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+ SrcVT = VT;
+ }
+ // And if it is bigger, shrink it first.
+ if (SrcVT.bitsGT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+ SrcVT = VT;
+ }
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+
+ // First get the sign bit of second operand.
+ std::vector<Constant*> CV;
+ if (SrcVT == MVT::f64) {
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+ // Shift sign bit right or left if the two operands have different types.
+ if (SrcVT.bitsGT(VT)) {
+ // Op0 is MVT::f32, Op1 is MVT::f64.
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+ SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+ DAG.getConstant(32, MVT::i32));
+ SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
+ SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // Clear first operand sign bit.
+ CV.clear();
+ if (VT == MVT::f64) {
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(APFloat(APInt(32, 0))));
+ }
+ C = ConstantVector::get(CV);
+ CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, 16);
+ SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ NeedOF = true;
+ break;
+ default: break;
+ }
+
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+ switch (Op.getNode()->getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is likely to
+ // be selected as part of a load-modify-store instruction. When the root
+ // node in a match is a store, isel doesn't know how to remap non-chain
+ // non-flag uses of other nodes in the match, such as the ADD in this
+ // case. This leads to the ADD being left around and reselected, with
+ // the result being two adds in the output.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->getAPIntValue() == 1) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->getAPIntValue().isAllOnesValue()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::SUB:
+ // Due to the ISEL shortcoming noted above, be conservative if this sub is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+ // Otherwise use a regular EFLAGS-setting sub.
+ Opcode = X86ISD::SUB;
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+ if (Opcode != 0) {
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i != NumOperands; ++i)
+ Ops.push_back(Op.getOperand(i));
+ SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+ }
+ }
+
+ // Otherwise just emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ if (C->getAPIntValue() == 0)
+ return EmitTest(Op0, X86CC, DAG);
+
+ DebugLoc dl = Op0.getDebugLoc();
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
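+  // For example, "(x & (1 << n)) != 0" becomes roughly "bt n, x" followed by
+  // setb, and the "== 0" form uses setae, since BT copies the selected bit
+  // into CF.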
+ if (Op0.getOpcode() == ISD::AND &&
+ Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue LHS, RHS;
+ if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op010C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
+ if (Op010C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(0);
+ RHS = Op0.getOperand(1).getOperand(1);
+ }
+ } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *Op000C =
+ dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
+ if (Op000C->getZExtValue() == 1) {
+ LHS = Op0.getOperand(1);
+ RHS = Op0.getOperand(0).getOperand(1);
+ }
+ } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
+ SDValue AndLHS = Op0.getOperand(0);
+ if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+ }
+
+ if (LHS.getNode()) {
+      // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+      // instruction. Since the shift amount is in-range-or-undefined, we know
+      // that doing a bittest on the i32 value is ok. We extend to i32 rather
+      // than i16 because the encoding for the i16 version is larger than the
+      // i32 version.
+ if (LHS.getValueType() == MVT::i8)
+ LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (LHS.getValueType() != RHS.getValueType())
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+ unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, MVT::i8), BT);
+ }
+ }
+
+ bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+
+ SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+}
+
+SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+ SDValue Cond;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (isFP) {
+ unsigned SSECC = 8;
+ MVT VT0 = Op0.getValueType();
+ assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
+ unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ bool Swap = false;
+
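+    // The cmpps/cmppd immediate selects one of eight SSE predicates:
+    //   0 = EQ, 1 = LT, 2 = LE, 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD.
+    // SSECC stays 8 when no single predicate matches the condition code.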
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true;
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // In the two special cases we can't handle, emit two comparisons.
+ if (SSECC == 8) {
+ if (SetCCOpcode == ISD::SETUEQ) {
+ SDValue UNORD, EQ;
+ UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
+ EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
+ }
+ else if (SetCCOpcode == ISD::SETONE) {
+ SDValue ORD, NEQ;
+ ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
+ NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+ }
+ assert(0 && "Illegal FP comparison");
+ }
+ // Handle all other FP comparisons here.
+ return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+ }
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+ bool Swap = false, Invert = false, FlipSigns = false;
+
+ switch (VT.getSimpleVT()) {
+ default: break;
+ case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+ case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = EQOpc; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = GTOpc; break;
+ case ISD::SETGE: Swap = true;
+ case ISD::SETLE: Opc = GTOpc; Invert = true; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true;
+ case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
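+  // The identity being used (shown here for i32 elements):
+  //   (x >u y)  ==  ((x ^ 0x80000000) >s (y ^ 0x80000000))
+  // so a signed pcmpgt on the sign-flipped inputs yields the unsigned result.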
+ if (FlipSigns) {
+ MVT EltVT = VT.getVectorElementType();
+ SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
+ EltVT);
+ std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
+ SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
+ SignBits.size());
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD ||
+ Opc == X86ISD::SUB ||
+ Opc == X86ISD::SMUL ||
+ Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC ||
+ Opc == X86ISD::DEC))
+ return true;
+
+ return false;
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG);
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ MVT VT = Op.getValueType();
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+ SmallVector<SDValue, 4> Ops;
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ Ops.push_back(Op.getOperand(2));
+ Ops.push_back(Op.getOperand(1));
+ Ops.push_back(CC);
+ Ops.push_back(Cond);
+ return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size());
+}
+
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
+// X86ISD::SETCC nodes, each of which has no other use apart from the
+// AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (N1C && N1C->getAPIntValue() == 1) {
+ return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ }
+ return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG);
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User.getOpcode() == ISD::BR) {
+ SDValue FalseBB = User.getOperand(1);
+ SDValue NewBR =
+ DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+ assert(NewBR == User);
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize the 'xorb (setcc), 1' pattern. The xor inverts the
+      // condition. It should be transformed by the DAG combiner except when
+      // the condition is set by an arithmetic-with-overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4K
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
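+//
+// A rough sketch of what this lowering produces on ia32 (register assignments
+// are illustrative only):
+//   movl  <size>, %eax
+//   call  _alloca         ; probes the stack in 4K pages and adjusts %esp
+//   movl  %esp, <result>  ; the adjusted stack pointer is the allocation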
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) {
+ assert(Subtarget->isTargetCygMing() &&
+ "This should be used only on Cygwin/Mingw targets");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ // FIXME: Ensure alignment here
+
+ SDValue Flag;
+
+ MVT IntPtr = getPointerTy();
+ MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
+ Flag = Chain.getValue(1);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain,
+ DAG.getTargetExternalSymbol("_alloca", IntPtr),
+ DAG.getRegister(X86::EAX, IntPtr),
+ DAG.getRegister(X86StackPtr, SPTy),
+ Flag };
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
+ Flag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(0, true),
+ DAG.getIntPtrConstant(0, true),
+ Flag);
+
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+}
+
+SDValue
+X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ const Value *DstSV,
+ uint64_t DstSVOff) {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  // If not DWORD aligned or the size exceeds the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run-time information about the CPU.
+ if ((Align & 3) != 0 ||
+ !ConstantSize ||
+ ConstantSize->getZExtValue() >
+ getSubtarget()->getMaxInlineSizeThreshold()) {
+ SDValue InFlag(0, 0);
+
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+ if (const char *bzeroEntry = V &&
+ V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ MVT IntPtr = getPointerTy();
+ const Type *IntPtrTy = TD->getIntPtrType();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ std::pair<SDValue,SDValue> CallResult =
+ LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
+ CallingConv::C, false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag(0, 0);
+ MVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use wider stores.
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ break;
+ }
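+    // Worked example (illustrative, not from the original source): for
+    // Val = 0xAB with a QWORD-aligned destination on x86-64, the constant is
+    // replicated 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB and a
+    // single 'rep stosq' then stores it eight bytes at a time.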
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+ BytesLeft = SizeVal % UBytes;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Size;
+ MVT CVT = Count.getValueType();
+ SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+ X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(MVT::i8));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
+ } else if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ MVT AddrVT = Dst.getValueType();
+ MVT SizeVT = Size.getValueType();
+
+ Chain = DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, AddrVT)),
+ Src,
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, DstSV, DstSVOff + Offset);
+ }
+
+  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+ return SDValue();
+
+  // If not DWORD aligned, call the library.
+ if ((Align & 3) != 0)
+ return SDValue();
+
+ // DWORD aligned
+ MVT AVT = MVT::i32;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
+ AVT = MVT::i64;
+
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ unsigned CountVal = SizeVal / UBytes;
+ SDValue Count = DAG.getIntPtrConstant(CountVal);
+ unsigned BytesLeft = SizeVal % UBytes;
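+  // Illustrative numbers: SizeVal = 30 with a QWORD-aligned copy gives
+  // AVT = i64, UBytes = 8, Count = 3 and BytesLeft = 6, i.e. a 'rep movsq' of
+  // three quadwords followed by the 6-byte tail memcpy emitted further down.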
+
+ SDValue InFlag(0, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ X86::ESI,
+ Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());
+
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ MVT DstVT = Dst.getValueType();
+ MVT SrcVT = Src.getValueType();
+ MVT SizeVT = Size.getValueType();
+ Results.push_back(DAG.getMemcpy(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+ DAG.getConstant(Offset, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+ DAG.getConstant(Offset, SrcVT)),
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, AlwaysInline,
+ DstSV, DstSVOff + Offset,
+ SrcSV, SrcSVOff + Offset));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Results[0], Results.size());
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (!Subtarget->is64Bit()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+  //  overflow_arg_area (points to parameters passed in memory).
+ // reg_save_area
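+  //
+  // In C terms, the x86-64 SysV va_list element is roughly (shown only to
+  // document the byte offsets used by the stores below):
+  //   struct __va_list_tag {
+  //     unsigned gp_offset;       // offset 0
+  //     unsigned fp_offset;       // offset 4
+  //     void *overflow_arg_area;  // offset 8
+  //     void *reg_save_area;      // offset 16
+  //   };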
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(VarArgsGPOffset, MVT::i32),
+ FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(VarArgsFPOffset, MVT::i32),
+ FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(8));
+ SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ SDValue SrcSV = Op.getOperand(2);
+
+ assert(0 && "VAArgInst is not yet implemented for x86-64!");
+ abort();
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(24), 8, false,
+ DstSV, 0, SrcSV, 0);
+}
+
+SDValue
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ // Comparison intrinsics.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ unsigned Opc = 0;
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse2_comilt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse2_comile_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse2_comigt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse2_comige_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse2_comineq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETNE;
+ break;
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETNE;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+ SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ // Fix vector shift instructions where the last operand is a non-immediate
+ // i32 value.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDValue ShAmt = Op.getOperand(2);
+ if (isa<ConstantSDNode>(ShAmt))
+ return SDValue();
+
+ unsigned NewIntNo = 0;
+ MVT ShAmtVT = MVT::v4i32;
+ switch (IntNo) {
+ case Intrinsic::x86_sse2_pslli_w:
+ NewIntNo = Intrinsic::x86_sse2_psll_w;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ NewIntNo = Intrinsic::x86_sse2_psll_d;
+ break;
+ case Intrinsic::x86_sse2_pslli_q:
+ NewIntNo = Intrinsic::x86_sse2_psll_q;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ NewIntNo = Intrinsic::x86_sse2_psrl_w;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ NewIntNo = Intrinsic::x86_sse2_psrl_d;
+ break;
+ case Intrinsic::x86_sse2_psrli_q:
+ NewIntNo = Intrinsic::x86_sse2_psrl_q;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ NewIntNo = Intrinsic::x86_sse2_psra_w;
+ break;
+ case Intrinsic::x86_sse2_psrai_d:
+ NewIntNo = Intrinsic::x86_sse2_psra_d;
+ break;
+ default: {
+ ShAmtVT = MVT::v2i32;
+ switch (IntNo) {
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntNo = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntNo = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntNo = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntNo = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntNo = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntNo = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntNo = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntNo = Intrinsic::x86_mmx_psra_d;
+ break;
+ default: abort(); // Can't reach here.
+ }
+ break;
+ }
+ }
+ MVT VT = Op.getValueType();
+ ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt));
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(NewIntNo, MVT::i32),
+ Op.getOperand(1), ShAmt);
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset =
+ DAG.getConstant(TD->getPointerSize(),
+ Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FrameAddr, Offset),
+ NULL, 0);
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ RetAddrFI, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+ return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) {
+ return DAG.getIntPtrConstant(2*TD->getPointerSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+{
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
+ getPointerTy());
+ unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
+
+ SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
+ DAG.getIntPtrConstant(-TD->getPointerSize()));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+ MF.getRegInfo().addLiveOut(StoreAddrReg);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl,
+ MVT::Other,
+ Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
+}
+
+SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ const X86InstrInfo *TII =
+ ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+
+ if (Subtarget->is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
+
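+    // The stores below assemble, roughly, this 23-byte trampoline; the byte
+    // values follow from the opcode/ModRM arithmetic and are shown here only
+    // for illustration:
+    //   offset  0: 49 BB <fptr:8>   movabsq $<nested fn>,  %r11
+    //   offset 10: 49 BA <nest:8>   movabsq $<nest value>, %r10
+    //   offset 20: 49 FF E3         jmpq    *%r11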
+ const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r);
+ const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
+
+ const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
+ const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 10);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 20);
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+ TrmpAddr, 22);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ unsigned CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ const FunctionType *FTy = Func->getFunctionType();
+ const AttrListPtr &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+ if (InRegCount > 2) {
+ cerr << "Nest register in use - reduce number of inreg parameters!\n";
+ abort();
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
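+    // The 10-byte 32-bit trampoline written below is, schematically (byte
+    // values shown for illustration; they come from getBaseOpcodeFor):
+    //   offset 0: B8+reg <nest:4>   movl $<nest value>, %ecx (or %eax)
+    //   offset 5: E9     <disp:4>   jmp  <nested fn>  ; disp = FPtr-(Trmp+10)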
+
+ const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
+ const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ OutChains[0] = DAG.getStore(Root, dl,
+ DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+ Trmp, TrmpAddr, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
+
+ const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+ TrmpAddr, 5, false, 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
+ /*
+   The rounding mode is in bits 11:10 of the x87 FP control word (FPCW, the
+   value stored by FNSTCW below), and has the following settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+  To perform the conversion, we do:
+    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
+ */
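+  // Worked check of the conversion (added for illustration): with control-word
+  // bits 11:10 equal to 00/01/10/11 the expression yields
+  //   ((0|0)+1)&3 = 1 (nearest),  ((0|2)+1)&3 = 3 (-inf),
+  //   ((1|0)+1)&3 = 2 (+inf),     ((1|2)+1)&3 = 0 (toward zero),
+  // matching the FLT_ROUNDS encoding above.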
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ MVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
+ DAG.getEntryNode(), StackSlot);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
+
+ // Transform as necessary
+ SDValue CWD1 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x800, MVT::i16)),
+ DAG.getConstant(11, MVT::i8));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x400, MVT::i16)),
+ DAG.getConstant(9, MVT::i8));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ DAG.getNode(ISD::ADD, dl, MVT::i16,
+ DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
+ DAG.getConstant(1, MVT::i16)),
+ DAG.getConstant(3, MVT::i16));
+
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is not an i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Op);
+ Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
+ Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
+ Ops.push_back(Op.getValue(1));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
+
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
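+  // Illustration: for a 32-bit input 0x00001000, bsr yields bit index 12 and
+  // 12 ^ 31 = 19 leading zeros. For i8 the zero-input CMOV constant above is
+  // 15, so 15 ^ 7 = 8, as ctlz of zero requires.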
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ MVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Op);
+ Ops.push_back(DAG.getConstant(NumBits, OpVT));
+ Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
+ Ops.push_back(Op.getValue(1));
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType();
+ assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
+ // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
+ // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
+ // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
+ // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
+ //
+ // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
+ // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
+ // return AloBlo + AloBhi + AhiBlo;
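+  //
+  // The identity behind this (mod 2^64, shown for illustration):
+  //   a * b = (a_lo + 2^32*a_hi) * (b_lo + 2^32*b_hi)
+  //         = a_lo*b_lo + 2^32*(a_lo*b_hi + a_hi*b_lo) + 2^64*(a_hi*b_hi)
+  // The last term vanishes mod 2^64, and pmuludq supplies the three
+  // 32x32->64 partial products.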
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ A, DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ B, DAG.getConstant(32, MVT::i32));
+ SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, B);
+ SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, Bhi);
+ SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ Ahi, B);
+ AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AloBhi, DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AhiBlo, DAG.getConstant(32, MVT::i32));
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ return Res;
+}
+
+
+SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular
+  // instruction plus a "setcc" that checks the overflow flag. The "brcond"
+  // lowering looks for this combo and may remove the "setcc" instruction if
+  // the "setcc" has only one use.
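+  //
+  // For instance, '@llvm.sadd.with.overflow.i32(a, b)' becomes an X86ISD::ADD
+  // producing (sum, EFLAGS) plus an X86ISD::SETCC that reads COND_O; the
+  // second result of the original node is rewritten to that SETCC below.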
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Unknown ovf instruction!");
+ case ISD::SADDO:
+    // An add of one will be selected as an INC. Note that INC doesn't set CF,
+    // so we can't do this for UADDO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ // A subtract of one will be selected as a DEC. Note that DEC doesn't
+ // set CF, so we can't do this for USUBO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO:
+ BaseOp = X86ISD::UMUL;
+ Cond = X86::COND_B;
+ break;
+ }
+
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
+ DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
+ return Sum;
+}
+
+SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
+ MVT T = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.getSimpleVT()) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget->is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
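+  // cmpxchg implicitly uses the accumulator (AL/AX/EAX/RAX) as the comparand
+  // and result, which is why the expected value is copied into Reg above and
+  // the result is copied back out of it below.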
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
+ return cpOut;
+}
+
+SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+ SelectionDAG &DAG) {
+ assert(Subtarget->is64Bit() && "Result not type legalized?");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+ SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+ rax.getValue(2));
+ SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+ DAG.getConstant(32, MVT::i8));
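+  // rdtsc returns the timestamp split across EDX:EAX (zero-extended into RDX
+  // and RAX in 64-bit mode), so the full value assembled below is
+  // (rdx << 32) | rax.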
+ SDValue Ops[] = {
+ DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+ rdx.getValue(1)
+ };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ MVT T = Node->getValueType(0);
+ SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+ DAG.getConstant(0, T), Node->getOperand(2));
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), negOp,
+ cast<AtomicSDNode>(Node)->getSrcValue(),
+ cast<AtomicSDNode>(Node)->getAlignment());
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Should not custom lower this!");
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
+ case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FABS: return LowerFABS(Op, DAG);
+ case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::VSETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+ case ISD::CTTZ: return LowerCTTZ(Op, DAG);
+ case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
+ }
+}
+
+void X86TargetLowering::
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG, unsigned NewOp) {
+ MVT T = Node->getValueType(0);
+ DebugLoc dl = Node->getDebugLoc();
+ assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(0));
+ SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(1));
+ // This is a generalized SDNode, not an AtomicSDNode, so it doesn't
+ // have a MemOperand. Pass the info through as a normal operand.
+ SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
+ SDValue Ops[] = { Chain, In1, In2L, In2H, LSI };
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5);
+ SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(Result.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default:
+ assert(false && "Do not know how to custom type legalize this operation!");
+ return;
+ case ISD::FP_TO_SINT: {
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode() != 0) {
+ MVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
+ }
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = N->getOperand(0);
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
+ rd.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
+ eax.getValue(2));
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { eax, edx };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
+ Results.push_back(edx.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_CMP_SWAP: {
+ MVT T = N->getValueType(0);
+ assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
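+    // cmpxchg8b compares EDX:EAX with the 64-bit memory operand; on a match it
+    // stores ECX:EBX, otherwise it loads the memory value into EDX:EAX. That
+    // is why the operands are marshalled into those register pairs below.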
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(0, MVT::i32));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(1, MVT::i32));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
+ cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(0, MVT::i32));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(1, MVT::i32));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
+ cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
+ swapInL.getValue(1));
+ SDValue Ops[] = { swapInH.getValue(0),
+ N->getOperand(1),
+ swapInH.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
+ MVT::i32, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
+ MVT::i32, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(cpOutH.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_LOAD_ADD:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_AND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_NAND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_OR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_SUB:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_XOR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
+ return;
+ case ISD::ATOMIC_SWAP:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+ return;
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FSRL: return "X86ISD::FSRL";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::TAILCALL: return "X86ISD::TAILCALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
+ case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
+ case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
+ case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
+ case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
+ case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::CMPPD: return "X86ISD::CMPPD";
+ case X86ISD::CMPPS: return "X86ISD::CMPPS";
+ case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
+ case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
+ case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
+ case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
+ case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
+ case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
+ case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
+ case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // X86 supports extremely general addressing modes.
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
+ return false;
+
+ if (AM.BaseGV) {
+ // We can only fold this if we don't need an extra load.
+ if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+ return false;
+ // If BaseGV requires a register, we cannot also have a BaseReg.
+ if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
+ AM.HasBaseReg)
+ return false;
+
+ // X86-64 only supports addr of globals in small code model.
+ if (Subtarget->is64Bit()) {
+ if (getTargetMachine().getCodeModel() != CodeModel::Small)
+ return false;
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if (AM.BaseOffs || AM.Scale > 1)
+ return false;
+ }
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+
+bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+ if (!Ty1->isInteger() || !Ty2->isInteger())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ MVT VT) const {
+ // Only do shuffles on 128-bit vector types for now.
+ if (VT.getSizeInBits() == 64)
+ return false;
+
+ // FIXME: pshufb, blends, palignr, shifts.
+ return (VT.getVectorNumElements() == 2 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isMOVLMask(M, VT) ||
+ isSHUFPMask(M, VT) ||
+ isPSHUFDMask(M, VT) ||
+ isPSHUFHWMask(M, VT) ||
+ isPSHUFLWMask(M, VT) ||
+ isUNPCKLMask(M, VT) ||
+ isUNPCKHMask(M, VT) ||
+ isUNPCKL_v_undef_Mask(M, VT) ||
+ isUNPCKH_v_undef_Mask(M, VT));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const {
+ unsigned NumElts = VT.getVectorNumElements();
+ // FIXME: This collection of masks seems suspect.
+ if (NumElts == 2)
+ return true;
+ if (NumElts == 4 && VT.getSizeInBits() == 128) {
+ return (isMOVLMask(Mask, VT) ||
+ isCommutedMOVLMask(Mask, VT, true) ||
+ isSHUFPMask(Mask, VT) ||
+ isCommutedSHUFPMask(Mask, VT));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned LoadOpc,
+ unsigned CXchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [bitinstr.addr]
+ // op t2 = t1, [bitinstr.val]
+ // mov EAX = t1
+ // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
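+  //
+  // As an illustrative instance, a 32-bit atomic AND would typically be fed
+  // MOV32rm/AND32rr/AND32ri/LCMPXCHG32 with EAX as the accumulator, giving a
+  // load, an AND into a temporary, and a cmpxchg retry loop.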
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+  // First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+  // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+  // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ // Insert instructions into newMBB based on incoming instruction
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ DebugLoc dl = bInstr->getDebugLoc();
+ MachineOperand& destOper = bInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = bInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &bInstr->getOperand(i+1);
+
+ // x86 address has 4 operands: base, index, scale, and displacement
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ unsigned tt = F->getRegInfo().createVirtualRegister(RC);
+ if (invSrc) {
+ MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
+ }
+ else
+ tt = t1;
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
+ MIB.addReg(tt);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t2);
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
+ MIB.addReg(EAXreg);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function: 64-bit atomics on a 32-bit host.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB (instructions are in pairs, except cmpxchg8b)
+ // ld t1,t2 = [bitinstr.addr]
+ // newMBB:
+ // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
+ // op t5, t6 <- out1, out2, [bitinstr.val]
+ // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
+ // mov ECX, EBX <- t5, t6
+ // mov EAX, EDX <- t1, t2
+ // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
+ // mov t3, t4 <- EAX, EDX
+ // bz newMBB
+ // result in out1, out2
+ // fallthrough -->nextMBB
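+  //
+  // For a 64-bit atomic add on a 32-bit host, for example, regOpcL/regOpcH
+  // would typically be ADD32rr/ADC32rr (ADD32ri/ADC32ri for immediates), so
+  // the carry from the low half propagates into the high half.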
+
+ const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const unsigned LoadOpc = X86::MOV32rm;
+ const unsigned copyOpc = X86::MOV32rr;
+ const unsigned NotOpc = X86::NOT32r;
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+  // First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+  // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+  // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = bInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+ "unexpected number of operands");
+ MachineOperand& dest1Oper = bInstr->getOperand(0);
+ MachineOperand& dest2Oper = bInstr->getOperand(1);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ for (int i=0; i < 2 + X86AddrNumOperands; ++i)
+ argOpers[i] = &bInstr->getOperand(i+2);
+
+ // x86 address has 4 operands: base, index, scale, and displacement
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
+ // add 4 to displacement.
+ for (int i=0; i <= lastAddrIndx-2; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MachineOperand newOp3 = *(argOpers[3]);
+ if (newOp3.isImm())
+ newOp3.setImm(newOp3.getImm()+4);
+ else
+ newOp3.setOffset(newOp3.getOffset()+4);
+ (*MIB).addOperand(newOp3);
+ (*MIB).addOperand(*argOpers[lastAddrIndx]);
+
+ // t3/4 are defined later, at the bottom of the loop
+ unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
+ .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
+
+ unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
+ if (invSrc) {
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
+ } else {
+ tt1 = t1;
+ tt2 = t2;
+ }
+
+ int valArgIndx = lastAddrIndx + 1;
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
+ if (regOpcL != X86::MOV32rr)
+ MIB.addReg(tt1);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+ assert(argOpers[valArgIndx + 1]->isReg() ==
+ argOpers[valArgIndx]->isReg());
+ assert(argOpers[valArgIndx + 1]->isImm() ==
+ argOpers[valArgIndx]->isImm());
+ if (argOpers[valArgIndx + 1]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
+ if (regOpcH != X86::MOV32rr)
+ MIB.addReg(tt2);
+ (*MIB).addOperand(*argOpers[valArgIndx + 1]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
+ MIB.addReg(t1);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
+ MIB.addReg(t2);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
+ MIB.addReg(t5);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
+ MIB.addReg(t6);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
+ MIB.addReg(X86::EAX);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
+ MIB.addReg(X86::EDX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+ MachineBasicBlock *MBB,
+ unsigned cmovOpc) const {
+ // For the atomic min/max operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [min/max.addr]
+ // mov t2 = [min/max.val]
+ // cmp t1, t2
+ // cmov[cond] t2 = t1
+ // mov EAX = t1
+  //     lcs dest = [min/max.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
+ //
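+  // As an illustrative sketch (assuming ATOMMIN32, i.e. cmovOpc = CMOVL32rr),
+  // the loop body is expected to look roughly like:
+  //   newMBB:
+  //     mov  t1, [addr]          ; current memory value
+  //     mov  t2, val
+  //     cmp  t1, t2
+  //     mov  t3, t2
+  //     cmovl t3, t1             ; t3 = signed min(t1, t2)
+  //     mov  eax, t1
+  //     lock cmpxchg [addr], t3  ; commit only if [addr] == eax
+  //     jne  newMBB              ; otherwise retry
+  //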
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+  // First build the CFG.
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+  // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+  // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = mInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ MachineOperand& destOper = mInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = mInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &mInstr->getOperand(i+1);
+
+ // x86 address has 4 operands: base, index, scale, and displacement
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ // We only support register and immediate values
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+  else
+    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
+ MIB.addReg(t1);
+ MIB.addReg(t2);
+
+  // Generate the conditional move (cmov)
+ unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
+ MIB.addReg(t2);
+ MIB.addReg(t1);
+
+  // Compare and exchange if no one else has modified the memory location
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t3);
+ assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
+ MIB.addReg(X86::EAX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case X86::CMOV_V1I64:
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by transferring all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ sinkMBB->transferSuccessors(BB);
+
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
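+    // A rough sketch of the emitted sequence (illustrative only; the exact
+    // store opcode depends on the source and destination types):
+    //   fnstcw  [slot]            ; save the current control word
+    //   mov     ax, [slot]        ; remember the old value
+    //   mov     word [slot], 0xC7F
+    //   fldcw   [slot]            ; rounding mode = round toward zero
+    //   fistp   [dest]            ; the truncating store itself
+    //   mov     [slot], ax        ; restore the original control word
+    //   fldcw   [slot]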
+ MachineFunction *F = BB->getParent();
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Save the old value of the control word so it can be restored later...
+ unsigned OldCW =
+ F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+    // Store a control word that selects round-toward-zero (0xC7F)...
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of the control word to its original value
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(X86AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+ case X86::ATOMAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
+ X86::OR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMXOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+ X86::XOR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMNAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass, true);
+ case X86::ATOMMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+ case X86::ATOMMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+ case X86::ATOMUMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+ case X86::ATOMUMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+
+ case X86::ATOMAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
+ X86::OR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMXOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
+ X86::XOR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMNAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass, true);
+ case X86::ATOMMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
+ case X86::ATOMMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
+ case X86::ATOMUMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
+ case X86::ATOMUMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+
+ case X86::ATOMAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
+ X86::OR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMXOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
+ X86::XOR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMNAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass, true);
+ // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+  // This group is for a 64-bit host.
+ case X86::ATOMAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
+ X86::OR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMXOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
+ X86::XOR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMNAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass, true);
+ case X86::ATOMMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+ case X86::ATOMMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
+ case X86::ATOMUMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+ case X86::ATOMUMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+ // This group does 64-bit operations on a 32-bit host.
+ case X86::ATOMAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ false);
+ case X86::ATOMOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::OR32rr, X86::OR32rr,
+ X86::OR32ri, X86::OR32ri,
+ false);
+ case X86::ATOMXOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::XOR32rr, X86::XOR32rr,
+ X86::XOR32ri, X86::XOR32ri,
+ false);
+ case X86::ATOMNAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ true);
+ case X86::ATOMADD6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::ADD32rr, X86::ADC32rr,
+ X86::ADD32ri, X86::ADC32ri,
+ false);
+ case X86::ATOMSUB6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::SUB32rr, X86::SBB32rr,
+ X86::SUB32ri, X86::SBB32ri,
+ false);
+ case X86::ATOMSWAP6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::MOV32rr, X86::MOV32rr,
+ X86::MOV32ri, X86::MOV32ri,
+ false);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ // These nodes' second result is a boolean.
+ if (Op.getResNo() == 0)
+ break;
+ // Fallthrough
+ case X86ISD::SETCC:
+ KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - 1);
+ break;
+ }
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+ GlobalValue* &GA, int64_t &Offset) const{
+ if (N->getOpcode() == X86ISD::Wrapper) {
+ if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+ return true;
+ }
+ }
+ return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
+ const TargetLowering &TLI) {
+ GlobalValue *GV;
+ int64_t Offset = 0;
+ if (TLI.isGAPlusOffset(Base, GV, Offset))
+ return (GV->getAlignment() >= N && (Offset % N) == 0);
+ // DAG combine handles the stack object case.
+ return false;
+}
+
+static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
+ MVT EVT, SDNode *&Base,
+ SelectionDAG &DAG, MachineFrameInfo *MFI,
+ const TargetLowering &TLI) {
+ Base = NULL;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (N->getMaskElt(i) < 0) {
+ if (!Base)
+ return false;
+ continue;
+ }
+
+ SDValue Elt = DAG.getShuffleScalarElt(N, i);
+ if (!Elt.getNode() ||
+ (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ return false;
+ if (!Base) {
+ Base = Elt.getNode();
+ if (Base->getOpcode() == ISD::UNDEF)
+ return false;
+ continue;
+ }
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+
+ if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
+ EVT.getSizeInBits()/8, i, MFI))
+ return false;
+ }
+ return true;
+}
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order. In the case of v2i64, it will see if it can rewrite the
+/// shuffle to be an appropriate build vector so it can take advantage of
+/// PerformBuildVectorCombine.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT VT = N->getValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
+ // where the upper half is 0, it is advantageous to rewrite it as a build
+ // vector of (0, val) so it can use movq.
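+  // For illustration (hypothetical input, not from a test case), with the
+  // shuffle mask <0, 3>:
+  //   shuffle (insert_vector_elt undef, x, 0), (build_vector 0, 0), <0, 3>
+  //   --> build_vector x, 0       ; selectable as a single movq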
+ if (VT == MVT::v2i64) {
+ SDValue In[2];
+ In[0] = N->getOperand(0);
+ In[1] = N->getOperand(1);
+ int Idx0 = SVN->getMaskElt(0);
+ int Idx1 = SVN->getMaskElt(1);
+ // FIXME: can we take advantage of undef index?
+ if (Idx0 >= 0 && Idx1 >= 0 &&
+ In[Idx0/2].getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ In[Idx1/2].getOpcode() == ISD::BUILD_VECTOR) {
+ ConstantSDNode* InsertVecIdx =
+ dyn_cast<ConstantSDNode>(In[Idx0/2].getOperand(2));
+ if (InsertVecIdx &&
+ InsertVecIdx->getZExtValue() == (unsigned)(Idx0 % 2) &&
+ isZeroNode(In[Idx1/2].getOperand(Idx1 % 2))) {
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+ In[Idx0/2].getOperand(1),
+ In[Idx1/2].getOperand(Idx1 % 2));
+ }
+ }
+ }
+
+ // Try to combine a vector_shuffle into a 128-bit load.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDNode *Base = NULL;
+ if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI))
+ return SDValue();
+
+ LoadSDNode *LD = cast<LoadSDNode>(Base);
+ if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
+ return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile());
+ return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset(),
+ LD->isVolatile(), LD->getAlignment());
+}
+
+/// PerformBuildVectorCombine - build_vector (load i64 / f64), 0 -> movq / movsd.
+static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget,
+ const TargetLowering &TLI) {
+ unsigned NumOps = N->getNumOperands();
+ DebugLoc dl = N->getDebugLoc();
+
+ // Ignore single operand BUILD_VECTOR.
+ if (NumOps == 1)
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
+    // We are looking for a load of i64 followed by a zero extend. We want to
+    // transform it before the legalizer has a chance to expand it. Also look
+    // for an i64 BUILD_PAIR bitcast to f64.
+ return SDValue();
+ // This must be an insertion into a zero vector.
+ SDValue HighElt = N->getOperand(1);
+ if (!isZeroNode(HighElt))
+ return SDValue();
+
+ // Value must be a load.
+ SDNode *Base = N->getOperand(0).getNode();
+ if (!isa<LoadSDNode>(Base)) {
+ if (Base->getOpcode() != ISD::BIT_CONVERT)
+ return SDValue();
+ Base = Base->getOperand(0).getNode();
+ if (!isa<LoadSDNode>(Base))
+ return SDValue();
+ }
+
+ // Transform it into VZEXT_LOAD addr.
+ LoadSDNode *LD = cast<LoadSDNode>(Base);
+
+ // Load must not be an extload.
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+
+  // The load type should be a legal type so we don't have to legalize it.
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+ SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
+ DCI.CommitTargetLoweringOpt(TLO);
+ return ResNode;
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ // If we have SSE[12] support, try to form min/max nodes.
+ if (Subtarget->hasSSE2() &&
+ (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
+ Cond.getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
+ case ISD::SETLT:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGT: // (X > Y) ? X : Y -> max
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGT: // (X > Y) ? Y : X -> min
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOLE: // (X <= Y) ? Y : X -> max
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
+ case ISD::SETLT:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
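+        // Worked example (illustrative): (C ? 14 : 5) has Diff = 9, so it
+        // becomes zext(C) * 9 + 5 (one LEA for the multiply by 9, plus an add
+        // of the base), with no branch and no cmov.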
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
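+      // For example (rough sketch, not verbatim output): a cmov of 8 vs. 0 on
+      // COND_NE becomes
+      //   setne %cl ; movzbl %cl, %ecx ; shll $3, %ecx
+      // avoiding both the branch and the conditional move.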
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+
+/// PerformMulCombine - Split a single multiply by a constant into two
+/// multiplies so it can be implemented with two cheaper instructions, e.g.
+/// LEA + SHL or LEA + LEA.
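+/// For example (illustrative): x * 45 (45 = 9 * 5) becomes two multiplies by
+/// 9 and 5 that each select to an LEA, and x * 24 (24 = 3 * 8) becomes a
+/// multiply by 3 (an LEA) plus a shift left by 3.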
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DAG.getMachineFunction().
+ getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ return SDValue();
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+ DebugLoc DL = N->getDebugLoc();
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If the second multiplier is a power of 2, issue it first. We want the
+      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
+      // the lone use is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ SDValue NewMul;
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, VT));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+ }
+ return SDValue();
+}
+
+
+/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
+/// when possible.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // On X86 with SSE2 support, we can transform this to a vector shift if
+  // all elements are shifted by the same amount. We can't do this during
+  // legalization because a constant vector is typically transformed into a
+  // constant pool load, so we have no knowledge of the shift amount.
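+  // For illustration: shifting <4 x i32> left by a splat of the same (possibly
+  // non-constant) amount becomes the x86_sse2_pslli_d intrinsic, i.e. a single
+  // pslld, instead of four independent scalar shifts.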
+ if (!Subtarget->hasSSE2())
+ return SDValue();
+
+ MVT VT = N->getValueType(0);
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+
+ SDValue ShAmtOp = N->getOperand(1);
+ MVT EltVT = VT.getVectorElementType();
+ DebugLoc DL = N->getDebugLoc();
+ SDValue BaseShAmt;
+ if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ if (Arg != BaseShAmt) {
+ return SDValue();
+ }
+ }
+ } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
+ DAG.getIntPtrConstant(0));
+ } else
+ return SDValue();
+
+ if (EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
+
+ // The shift amount is identical so we can do a vector shift.
+ SDValue ValOp = N->getOperand(0);
+ switch (N->getOpcode()) {
+ default:
+ assert(0 && "Unknown shift opcode!");
+ break;
+ case ISD::SHL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRA:
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ }
+ return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+ // the FP state in cases where an emms may be missing.
+ // A preferable solution to the general problem is to figure out the right
+ // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
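+  // For illustration: on x86-32 with SSE2, copying an i64 via
+  //   %t = load i64* %p  ;  store i64 %t, i64* %q
+  // becomes an f64 (movsd) load/store pair, while an MMX-typed copy without
+  // SSE2 is split into two i32 load/store pairs, so no MMX register is ever
+  // written and no emms is required.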
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ MVT VT = St->getValue().getValueType();
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+ St->getChain().hasOneUse() && !St->isVolatile()) {
+ SDNode* LdVal = St->getValue().getNode();
+ LoadSDNode *Ld = 0;
+ int TokenFactorIndex = -1;
+ SmallVector<SDValue, 8> Ops;
+ SDNode* ChainVal = St->getChain().getNode();
+    // Must be a store of a load. We currently handle two cases: the load
+    // is a direct child, or it sits behind an intervening TokenFactor. It
+    // would be possible to dig deeper under nested TokenFactors.
+ if (ChainVal == LdVal)
+ Ld = cast<LoadSDNode>(St->getChain());
+ else if (St->getValue().hasOneUse() &&
+ ChainVal->getOpcode() == ISD::TokenFactor) {
+ for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ if (ChainVal->getOperand(i).getNode() == LdVal) {
+ TokenFactorIndex = i;
+ Ld = cast<LoadSDNode>(St->getValue());
+ } else
+ Ops.push_back(ChainVal->getOperand(i));
+ }
+ }
+
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ DebugLoc LdDL = Ld->getDebugLoc();
+ DebugLoc StDL = N->getDebugLoc();
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getSrcValue(),
+ Ld->getSrcValueOffset(), Ld->isVolatile(),
+ Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->getAlignment());
+ }
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset(),
+ Ld->isVolatile(), Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+ Ld->isVolatile(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getSrcValue(),
+ St->getSrcValueOffset() + 4,
+ St->isVolatile(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ }
+ return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+ // F[X]OR(0.0, x) -> x
+ // F[X]OR(x, 0.0) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ return SDValue();
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+ // FAND(0.0, x) -> 0.0
+ // FAND(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
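+  // For example (illustrative): a 32-bit BT only reads the low 5 bits of the
+  // index, so an index of the form (x & 31) can be shrunk to just x.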
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG);
+ TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+ case ISD::BUILD_VECTOR:
+ return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
+ case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
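+/// For illustration (hypothetical user code): in
+///   asm ("rdtsc" : "=A" (t));
+/// the 'A' constraint is classified as C_Register (the EDX:EAX pair), while
+/// 'r', 'q', 'x' and the other register letters are C_RegisterClass and the
+/// immediate-range letters 'e' and 'Z' are C_Other.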
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'A':
+ return C_Register;
+ case 'f':
+ case 'r':
+ case 'R':
+ case 'l':
+ case 'q':
+ case 'Q':
+ case 'x':
+ case 'y':
+ case 'Y':
+ return C_RegisterClass;
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(MVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget->hasSSE2())
+ return "Y";
+ if (Subtarget->hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char Constraint,
+ bool hasMemory,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ switch (Constraint) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+ break;
+ }
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = 0;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ if (hasMemory)
+ Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), Offset, DAG);
+ else
+ Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+ Offset);
+ Result = Op;
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+ Ops, DAG);
+}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ // FIXME: not handling fp-stack yet!
+ switch (Constraint[0]) { // GCC X86 Constraint Letters
+ default: break; // Unknown constraint letter
+ case 'q': // Q_REGS (GENERAL_REGS in 64-bit mode)
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+ else if (VT == MVT::i8)
+ return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+ else if (VT == MVT::i64)
+ return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
+ break;
+ }
+ }
+
+ return std::vector<unsigned>();
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ case 'R': // LEGACY_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8RegisterClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16RegisterClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR32RegisterClass);
+ return std::make_pair(0U, X86::GR64RegisterClass);
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP32RegisterClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP64RegisterClass);
+ return std::make_pair(0U, X86::RFP80RegisterClass);
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, X86::VR64RegisterClass);
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT.getSimpleVT()) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, X86::FR32RegisterClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, X86::FR64RegisterClass);
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, X86::VR128RegisterClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ // GCC calls "st(0)" just plain "st".
+ if (StringsEqualNoCase("{st}", Constraint)) {
+ Res.first = X86::ST0;
+ Res.second = X86::RFP80RegisterClass;
+ }
+ // 'A' means EAX + EDX.
+ if (Constraint == "A") {
+ Res.first = X86::EAX;
+ Res.second = X86::GRADRegisterClass;
+ }
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ if (Res.second->hasType(VT))
+ return Res; // Correct type already, nothing to do.
+
+ // All of the single-register GCC register classes map their values onto
+ // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
+ // really want an 8-bit or 32-bit register, map to the appropriate register
+ // class and return the appropriate register.
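+  // For example (illustrative): an i32 operand whose constraint resolves to
+  // {ax} is remapped here to EAX in GR32 rather than being left as the
+  // 16-bit AX.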
+ if (Res.second == X86::GR16RegisterClass) {
+ if (VT == MVT::i8) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::AL; break;
+ case X86::DX: DestReg = X86::DL; break;
+ case X86::CX: DestReg = X86::CL; break;
+ case X86::BX: DestReg = X86::BL; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR8RegisterClass;
+ }
+ } else if (VT == MVT::i32) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::EAX; break;
+ case X86::DX: DestReg = X86::EDX; break;
+ case X86::CX: DestReg = X86::ECX; break;
+ case X86::BX: DestReg = X86::EBX; break;
+ case X86::SI: DestReg = X86::ESI; break;
+ case X86::DI: DestReg = X86::EDI; break;
+ case X86::BP: DestReg = X86::EBP; break;
+ case X86::SP: DestReg = X86::ESP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR32RegisterClass;
+ }
+ } else if (VT == MVT::i64) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::RAX; break;
+ case X86::DX: DestReg = X86::RDX; break;
+ case X86::CX: DestReg = X86::RCX; break;
+ case X86::BX: DestReg = X86::RBX; break;
+ case X86::SI: DestReg = X86::RSI; break;
+ case X86::DI: DestReg = X86::RDI; break;
+ case X86::BP: DestReg = X86::RBP; break;
+ case X86::SP: DestReg = X86::RSP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR64RegisterClass;
+ }
+ }
+ } else if (Res.second == X86::FR32RegisterClass ||
+ Res.second == X86::FR64RegisterClass ||
+ Res.second == X86::VR128RegisterClass) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+ if (VT == MVT::f32)
+ Res.second = X86::FR32RegisterClass;
+ else if (VT == MVT::f64)
+ Res.second = X86::FR64RegisterClass;
+ else if (X86::VR128RegisterClass->hasType(VT))
+ Res.second = X86::VR128RegisterClass;
+ }
+
+ return Res;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Widen vector type
+//===----------------------------------------------------------------------===//
+
+/// getWidenVectorType: given a vector type, returns the type to widen
+/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+/// If there is no vector type that we want to widen to, returns MVT::Other.
+/// When and where to widen is target-dependent, based on the cost of
+/// scalarizing vs. using the wider vector type.
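+/// For example (illustrative, assuming SSE2 is available): v3i32 widens to
+/// v4i32, the first legal vector type with the same element type and more
+/// elements; if no such type exists, MVT::Other is returned.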
+
+MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
+ assert(VT.isVector());
+ if (isTypeLegal(VT))
+ return VT;
+
+ // TODO: In computeRegisterProperty, we can compute the list of legal vector
+  // types based on element type. This would speed up our search (though
+ // it may not be worth it since the size of the list is relatively
+ // small).
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NElts = VT.getVectorNumElements();
+
+  // On X86, it makes sense to widen any vector wider than 1 element
+ if (NElts <= 1)
+ return MVT::Other;
+
+ for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
+ nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType)nVT;
+
+ if (isTypeLegal(SVT) &&
+ SVT.getVectorElementType() == EltVT &&
+ SVT.getVectorNumElements() > NElts)
+ return SVT;
+ }
+ return MVT::Other;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..550f8bd
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,705 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// BSF - Bit scan forward.
+ /// BSR - Bit scan reverse.
+ BSF,
+ BSR,
+
+ /// SHLD, SHRD - Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// FAND - Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// FOR - Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. This
+      /// corresponds to X86::PSRLDQ.
+ FSRL,
+
+ /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// FLD - This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// FST - This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// CALL/TAILCALL - These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+      /// operands of these nodes are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ /// The CALL vs TAILCALL distinction boils down to whether the callee is
+ /// known not to modify the caller's stack frame, as is standard with
+ /// LLVM.
+ CALL,
+ TAILCALL,
+
+ /// RDTSC_DAG - This operation implements the lowering for
+ /// readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// Wrapper - A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// INSERTPS - Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// PSHUFB - Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// FMAX, FMIN - Floating point max and min.
+ ///
+ FMAX, FMIN,
+
+ /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+ /// approximation. Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // TLSADDR - Thread Local Storage.
+ TLSADDR,
+
+ // SegmentBaseAddress - The address segment:0
+ SegmentBaseAddress,
+
+ // EH_RETURN - Exception Handling helpers.
+ EH_RETURN,
+
+ /// TC_RETURN - Tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN,
+
+ // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
+ LCMPXCHG_DAG,
+ LCMPXCHG8_DAG,
+
+ // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+ // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+ // Atomic 64-bit binary operations.
+ ATOMADD64_DAG,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMSWAP64_DAG,
+
+      // FNSTCW16m - Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ // VZEXT_MOVL - Vector move low and zero extend.
+ VZEXT_MOVL,
+
+ // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // VSHL, VSRL - Vector logical left / right shift.
+ VSHL, VSRL,
+
+ // CMPPD, CMPPS - Vector double/float comparison.
+ CMPPD, CMPPS,
+
+ // PCMP* - Vector integer comparisons.
+ PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ,
+ PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ,
+
+ // ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results.
+ ADD, SUB, SMUL, UMUL,
+ INC, DEC,
+
+ // MUL_IMM - X86 specific multiply by immediate.
+ MUL_IMM
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFD.
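+    /// For example (illustrative), a v4i32 mask of <2, 3, 0, 1> draws every
+    /// element from a single source and can be encoded as the single PSHUFD
+    /// immediate 0x4E, so it is accepted; a mask that mixes elements from
+    /// both input vectors is not.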
+ bool isPSHUFDMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+ bool isPSHUFHWMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+ bool isPSHUFLWMask(ShuffleVectorSDNode *N);
+
+ /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+ bool isSHUFPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+ bool isMOVHLPSMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+ /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+ /// <2, 3, 2, 3>
+ bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for MOVLP{S|D}.
+ bool isMOVLPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for MOVHP{S|D},
+    /// as well as MOVLHPS.
+ bool isMOVHPMask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+ bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+ bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+ /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+ /// <0, 0, 1, 1>
+ bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+ /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+ /// <2, 2, 3, 3>
+ bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSS,
+ /// MOVSD, and MOVD, i.e. setting the lowest element.
+ bool isMOVLMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+ bool isMOVSHDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+ bool isMOVSLDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+ bool isMOVDDUPMask(ShuffleVectorSDNode *N);
+
+ /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+ /// instructions.
+ unsigned getShuffleSHUFImmediate(SDNode *N);
+
+ /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+ /// instructions.
+ unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+ /// instructions.
+ unsigned getShufflePSHUFLWImmediate(SDNode *N);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86TargetLowering - X86 Implementation of the TargetLowering interface
+ class X86TargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ int RegSaveFrameIndex; // X86-64 vararg func register save area.
+ unsigned VarArgsGPOffset; // X86-64 vararg func int reg offset.
+ unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset.
+ int BytesToPopOnReturn; // Number of arg bytes ret should pop.
+ int BytesCallerReserves; // Number of arg bytes caller makes.
+
+ public:
+ explicit X86TargetLowering(X86TargetMachine &TM);
+
+    /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+ /// jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const;
+
+ // Return the number of bytes that a function should pop when it returns (in
+ // addition to the space used by the return address).
+ //
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+ // Return the number of bytes that the caller reserves for arguments passed
+ // to this function.
+ unsigned getBytesCallerReserves() const { return BytesCallerReserves; }
+
+ /// getStackPtrReg - Return the stack pointer register we are using: either
+ /// ESP or RSP.
+ unsigned getStackPtrReg() const { return X86StackPtr; }
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates
+ /// that contains are placed at 16-byte boundaries while the rest are at
+ /// 4-byte boundaries.
+ virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+ /// determining it.
+ virtual
+ MVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+ bool isSrcConst, bool isSrcStr) const;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG);
+
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual MVT getSetCCResultType(MVT VT) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual bool
+ isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ virtual const char *LowerXConstraint(MVT ConstraintVT) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true, it means one of the asm constraints of the inline asm instruction
+ /// being processed is 'm'.
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// getRegForInlineAsmConstraint - Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isTruncateFree(MVT VT1, MVT VT2) const;
+
+ /// isZExtFree - Return true if any actual instruction that defines a
+ /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+ /// result out to 64 bits.
+ virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isZExtFree(MVT VT1, MVT VT2) const;
+
+ /// isNarrowingProfitable - Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;
+
+ /// isShuffleMaskLegal - Targets can use this to indicate that they only
+ /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+ /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+ /// values are assumed to be legal.
+ virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
+    /// use this to indicate whether there is a suitable
+    /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+ /// pool entry.
+ virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ MVT VT) const;
+
+ /// ShouldShrinkFPConstant - If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ virtual bool ShouldShrinkFPConstant(MVT VT) const {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Target which want to do tail call
+ /// optimization should implement this function.
+ virtual bool IsEligibleForTailCallOptimization(CallSDNode *TheCall,
+ SDValue Ret,
+ SelectionDAG &DAG) const;
+
+ virtual const X86Subtarget* getSubtarget() {
+ return Subtarget;
+ }
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
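+    /// For example (illustrative), when compiling with SSE2 both f32 and f64
+    /// are computed in XMM registers and this returns true for them, while
+    /// f80 always stays on the x87 stack and this returns false.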
+ bool isScalarFPTypeInSSEReg(MVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is used when SSE2 is available
+             (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is used when SSE1 is available
+ }
+
+ /// getWidenVectorType: given a vector type, returns the type to widen
+ /// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+    /// If there is no vector type that we want to widen to, returns MVT::Other.
+    /// When and where to widen is target dependent, based on the cost of
+    /// scalarizing vs. using the wider vector type.
+ virtual MVT getWidenVectorType(MVT VT) const;
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ virtual FastISel *
+ createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi, DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &
+#endif
+ );
+
+ private:
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+ const X86RegisterInfo *RegInfo;
+ const TargetData *TD;
+
+ /// X86StackPtr - X86 physical register used as stack ptr.
+ unsigned X86StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ SDNode *LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+
+ SDValue LowerMemArgument(SDValue Op, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned CC, SDValue Root, unsigned i);
+
+ SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
+ const SDValue &StackPtr,
+ const CCValAssign &VA, SDValue Chain,
+ SDValue Arg, ISD::ArgFlagsTy Flags);
+
+ // Call lowering helpers.
+ bool IsCalleePop(bool isVarArg, unsigned CallingConv);
+ bool CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall);
+ bool CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall);
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl);
+
+ CCAssignFn *CCAssignFnForNode(unsigned CallingConv) const;
+ NameDecorationStyle NameDecorationForFORMAL_ARGUMENTS(SDValue Op);
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned);
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+ SDValue BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG);
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFABS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCALL(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG);
+
+ SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG);
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG);
+
+ void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned NewOp);
+
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ const Value *DstSV, uint64_t DstSVOff);
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff);
+
+ /// Utility function to emit atomic bitwise operations (and, or, xor).
+    /// It takes the bitwise instruction to expand, the associated machine basic
+    /// block, and the associated X86 opcodes for reg/reg and reg/imm.
+ MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned loadOpc,
+ unsigned cxchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc = false) const;
+
+ MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc = false) const;
+
+ /// Utility function to emit atomic min and max. It takes the min/max
+ /// instruction to expand, the associated basic block, and the associated
+ /// cmov opcode for moving the min or max value.
+ MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned cmovOpc) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG);
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG);
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(MachineFunction &mf,
+ MachineModuleInfo *mmi, DwarfWriter *dw,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+ , SmallSet<Instruction*, 8> &
+#endif
+ );
+ }
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
new file mode 100644
index 0000000..dc15e4a
--- /dev/null
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -0,0 +1,1937 @@
+//====- X86Instr64bit.td - Describe X86-64 Instructions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 64 bits, but only 32 bits are significant.
+def i64i32imm : Operand<i64>;
+// 64 bits, but only 8 bits are significant.
+def i64i8imm : Operand<i64>;
+
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printlea64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printlea64_32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions.
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+ [add, mul, X86mul_imm, shl, or, frameindex, X86Wrapper],
+ []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+def i64immSExt8 : PatLeaf<(i64 imm), [{
+  // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+ // sign extended field.
+ return (int64_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}]>;
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero extended (unsigned) field.
+ return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}]>;
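+
+// For example (illustrative): -128 and 127 satisfy i64immSExt8, 0x7FFFFFFF
+// satisfies i64immSExt32 but not i64immSExt8, and 0xFFFFFFFF satisfies only
+// i64immZExt32 of the three.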
+
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. RSP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : I<0xE8, RawFrm,
+ (outs), (ins i64i32imm:$dst, variable_ops),
+ "call\t${dst:call}", []>,
+ Requires<[In64BitMode]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR64:$dst)]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call (loadi64 addr:$dst))]>;
+ }
+
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi64 : I<0, Pseudo, (outs), (ins i64imm:$dst, i32imm:$offset,
+ variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64:$dst, i32imm:$offset,
+ variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
+ "jmp{q}\t{*}$dst # TAILCALL",
+ []>;
+
+// Branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP64r : I<0x58, AddRegFrm,
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+let mayStore = 1 in
+def PUSH64r : I<0x50, AddRegFrm,
+ (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1 in
+def POPFQ : I<0x9D, RawFrm, (outs), (ins), "popf", []>, REX_W;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1 in
+def PUSHFQ : I<0x9C, RawFrm, (outs), (ins), "pushf", []>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsf (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (X86bsr (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Repeat string ops
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI] in
+def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)]>, REP;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI] in
+def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)]>, REP;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+
+let neverHasSideEffects = 1 in
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)]>;
+}
+
+let canFoldAsLoad = 1 in
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)]>;
+
+// Sign/Zero extenders
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR32:$src))]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+// Any instruction that defines a 32-bit result zeros the upper half of the
+// 64-bit register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg
+// may be copying from a truncate, but any other 32-bit operation will
+// zero-extend up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetInstrInfo::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
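+// For example (illustrative): (i64 (zext (add GR32:$a, GR32:$b))) matches
+// def32, so it can be selected as a 32-bit add followed by SUBREG_TO_REG,
+// with no separate zero-extending move.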
+
+let neverHasSideEffects = 1 in {
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions...
+//
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Addition
+def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Addition
+def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isConvertibleToThreeAddress
+
+// Register-Memory Addition
+def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+// Memory-Register Addition
+def ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (outs), (ins i64mem:$dst, i64i32imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr : RI<0x11, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+def ADC64rm : RI<0x13, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri8 : RIi8<0x83, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+def ADC64ri32 : RIi32<0x81, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr : RI<0x11, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+             [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+
+let isTwoAddress = 1 in {
+// Register-Register Subtraction
+def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Memory Subtraction
+def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+// Memory-Register Subtraction
+def SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Subtraction
+def SUB64mi8 : RIi8<0x83, MRM5m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+def SBB64rr : RI<0x19, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+def SBB64rm : RI<0x1B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri8 : RIi8<0x83, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def SBB64mr : RI<0x19, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+// Unsigned multiplication
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in {
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+}
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Signed Integer Multiplication
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>, TB;
+
+// Register-Memory Signed Integer Multiplication
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two-address instructions!
+
+// Register-Integer Signed Integer Multiplication
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiplication
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1),
+ i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1),
+ i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Unsigned division / remainder
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in {
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "div{q}\t$src", []>;
+// Signed division / remainder
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "idiv{q}\t$src", []>;
+let mayLoad = 1 in {
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "div{q}\t$src", []>;
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "idiv{q}\t$src", []>;
+}
+}
+
+// Unary instructions
+let Defs = [EFLAGS], CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src)),
+ (implicit EFLAGS)]>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst",
+ [(set GR64:$dst, (add GR64:$src, 1)),
+ (implicit EFLAGS)]>;
+def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst",
+ [(set GR64:$dst, (add GR64:$src, -1)),
+ (implicit EFLAGS)]>;
+def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, 1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, 1)),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, -1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, -1)),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+ def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+}
+} // Defs = [EFLAGS], CodeSize
+
+
+let Defs = [EFLAGS] in {
+// Shift instructions
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (shl GR64:$src, CL))]>;
+let isConvertibleToThreeAddress = 1 in // Can transform into LEA.
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
+// cheaper.
+} // isTwoAddress
+
+let Uses = [CL] in
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (srl GR64:$src, CL))]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (sra GR64:$src, CL))]>;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotl GR64:$src, CL))]>;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "rol{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotr GR64:$src, CL))]>;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
+let isTwoAddress = 1 in {
+let Uses = [CL] in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, TB;
+}
+
+let isCommutable = 1 in { // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+} // isCommutable
+} // isTwoAddress
+
+let Uses = [CL] in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+}
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Logical Instructions...
+//
+
+let isTwoAddress = 1, AddedComplexity = 15 in
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr : RI<0x21, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def AND64rm : RI<0x23, MRMSrcMem,
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def AND64ri8 : RIi8<0x83, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def AND64ri32 : RIi32<0x81, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def AND64mr : RI<0x21, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+ (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi32 : RIi32<0x81, MRM4m,
+ (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi8 : RIi8<0x83, MRM1m, (outs), (ins i64mem:$dst, i64i8imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)),
+ (implicit EFLAGS)]>;
+def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+def XOR64ri32 : RIi32<0x81, MRM6r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)),
+ (implicit EFLAGS)]>;
+} // isTwoAddress
+
+def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+// Integer comparison
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, GR64:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (outs),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0),
+ (implicit EFLAGS)]>;
+
+def CMP64rr : RI<0x39, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mr : RI<0x39, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), GR64:$src2),
+ (implicit EFLAGS)]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (outs), (ins GR64:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Disable these instructions for now.
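+// (For example, "btq %rax, (%rdi)" with a bit index of 100 in %rax reads the
+// quadword at 8(%rdi), i.e. memory outside the i64 at the given address.)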
+//def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+// "bt{q}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi64 addr:$src1), GR64:$src2),
+// (implicit EFLAGS)]>, TB;
+
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Conditional moves
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+let isCommutable = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg, // if >=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg, // if ==, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg, // if !=, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg, // if <=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg, // if >u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg, // if <s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg, // if >=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg, // if <=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg, // if >s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg, // if signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg, // if !signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg, // if parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg, // if !parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rr : RI<0x40, MRMSrcReg, // if overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rr : RI<0x41, MRMSrcReg, // if !overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isCommutable = 1
+
+def CMOVB64rm : RI<0x42, MRMSrcMem, // if <u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem, // if >=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem, // if ==, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem, // if !=, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem, // if <=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem, // if >u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem, // if <s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem, // if >=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem, // if <=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem, // if >s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem, // if signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem, // if !signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem, // if parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem, // if !parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rm : RI<0x40, MRMSrcMem, // if overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rm : RI<0x41, MRMSrcMem, // if !overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isTwoAddress
+
+//===----------------------------------------------------------------------===//
+// Conversion Instructions...
+//
+
+// f64 -> signed i64
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvtsd2si64 VR128:$src))]>;
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f128mem:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse2_cvtsd2si64
+ (load addr:$src)))]>;
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64 VR128:$src))]>;
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f128mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64
+ (load addr:$src)))]>;
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ GR64:$src2))]>;
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+ def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ GR64:$src2))]>;
+ def Int_CVTSI2SS64rm : RSSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+}
+
+// f32 -> signed i64
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvtss2si64 VR128:$src))]>;
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse_cvtss2si64
+ (load addr:$src)))]>;
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 VR128:$src))]>;
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 (load addr:$src)))]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor. Use xorl instead of xorq; it's
+// equivalent due to implicit zero-extending, and it sometimes has a smaller
+// encoding.
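+// (For example, "xorl %eax, %eax" is a 2-byte encoding that clears all of RAX,
+// because a 32-bit register write implicitly zeroes the upper 32 bits, while
+// "xorq %rax, %rax" needs a REX.W prefix and takes 3 bytes.)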
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS], AddedComplexity = 1,
+ isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins),
+ "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
+ [(set GR64:$dst, 0)]>;
+
+// Materialize i64 constant where top 32-bits are zero.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//===----------------------------------------------------------------------===//
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64imm:$sym),
+ ".byte\t0x66; "
+ "leaq\t${sym:mem}(%rip), %rdi; "
+ ".word\t0x6666; "
+ "rex64; "
+ "call\t__tls_get_addr@PLT",
+ [(X86tlsaddr tglobaltlsaddr:$sym)]>,
+ Requires<[In64BitMode]>;
+
+let AddedComplexity = 5 in
+def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%gs:$src, $dst",
+ [(set GR64:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5 in
+def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%fs:$src, $dst",
+ [(set GR64:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// Atomic Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [RAX, EFLAGS], Uses = [RAX] in {
+def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
+ "lock\n\t"
+ "cmpxchgq\t$swap,$ptr",
+ [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+
+let Constraints = "$val = $dst" in {
+let Defs = [EFLAGS] in
+def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
+ "lock\n\t"
+ "xadd\t$val, $ptr",
+ [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
+ TB, LOCK;
+def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val),
+ "xchg\t$val, $ptr",
+ [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+}
+
+// Atomic and, or, xor, nand, min, max, umin, umax (custom-inserted pseudos)
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
+def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
+def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMXOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
+def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMNAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
+def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
+ "#ATOMMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
+def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+
+// Calls
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall GR64:$dst),
+ (CALL64r GR64:$dst)>;
+
+
+// tailcall stuff
+def : Pat<(X86tailcall GR32:$dst),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR64:$dst, imm:$off),
+ (TCRETURNri64 GR64:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(parallel (X86cmp GR64:$src1, 0), (implicit EFLAGS)),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO64rm GR64:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+// When extloading from 16-bit and smaller memory locations into 64-bit registers,
+// use zero-extending loads so that the entire 64-bit register is defined, avoiding
+// partial-register updates.
+def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV32rm addr:$src),
+ x86_subreg_32bit)>;
+def : Pat<(extloadi16i1 addr:$src),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
+ x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(extloadi16i8 addr:$src),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), (MOV8rm addr:$src),
+ x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+// anyext
+def : Pat<(i64 (anyext GR8:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, x86_subreg_32bit)>;
+def : Pat<(i16 (anyext GR8:$src)),
+ (INSERT_SUBREG (i16 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext GR8:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
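+// (For example, "addq $128, %rax" needs a 4-byte imm32 and encodes in 7 bytes,
+// while the equivalent "subq $-128, %rax" uses a 1-byte imm8 and takes 4.)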
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, x86_subreg_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// trunc patterns
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
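+// (For example, "(x >> 8) & 255" with x in EAX/EBX/ECX/EDX becomes a single
+// movzbl from AH/BH/CH/DH; the *_NOREX opcodes and *_ABCD register classes
+// keep REX-prefixed registers out of the instruction, since a REX prefix
+// cannot encode the high-byte registers.)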
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR64:$src, GR64_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR64:$src, GR64_ABCD),
+ x86_subreg_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+def : Pat<(shl GR64:$src1, (and CL:$amt, 63)),
+ (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SHL64mCL addr:$dst)>;
+
+def : Pat<(srl GR64:$src1, (and CL:$amt, 63)),
+ (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SHR64mCL addr:$dst)>;
+
+def : Pat<(sra GR64:$src1, (and CL:$amt, 63)),
+ (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+ (SAR64mCL addr:$dst)>;
+
+// (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c)
+def : Pat<(or (srl GR64:$src1, CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(or (srl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (shl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+def : Pat<(or (shl GR64:$src1, CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(or (shl GR64:$src1, (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), (i8 (trunc RCX:$amt))),
+ (srl GR64:$src2, (i8 (trunc (sub 64, RCX:$amt))))),
+ addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+ (SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1),
+ GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+// X86-specific add and sub nodes that produce a carry/borrow flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), GR64:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mr addr:$dst, GR64:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Integer Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2),
+ (implicit EFLAGS)),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+ (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+ (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+ (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+ (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)),
+ (INC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86inc_flag (loadi64 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC64m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)),
+ (DEC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86dec_flag (loadi64 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC64m addr:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize, REX_W;
+}
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+let isTwoAddress = 1 in {
+ multiclass SS41I_insert64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ OpSize, REX_W;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, OpSize, REX_W;
+ }
+}
+
+defm PINSRQ : SS41I_insert64<0x22, "pinsrq">;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..39504cd
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,168 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The functions below may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier-to-use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ unsigned Disp;
+ GlobalValue *GV;
+
+ X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+ Base.Reg = 0;
+ }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
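+/// A typical use, sketched here with an illustrative opcode choice:
+///   addDirectMem(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), DestReg), BaseReg)
+/// appends the operands for the equivalent of "movl (%basereg), %destreg".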
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+ unsigned Reg) {
+ // Because memory references are always represented with four
+ // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+inline const MachineInstrBuilder &addLeaOffset(const MachineInstrBuilder &MIB,
+ int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset);
+}
+
+inline const MachineInstrBuilder &addOffset(const MachineInstrBuilder &MIB,
+ int Offset) {
+ return addLeaOffset(MIB, Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
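+/// For instance, addRegOffset(MIB, X86::EAX, /*isKill=*/false, 4) appends the
+/// operands for such a reference to the instruction being built.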
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill,
+ int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+inline const MachineInstrBuilder &addLeaRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill,
+ int Offset) {
+ return addLeaOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0);
+}
+
+inline const MachineInstrBuilder &addLeaAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ else
+ assert (0);
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ return MIB.addGlobalAddress(AM.GV, AM.Disp);
+ else
+ return MIB.addImm(AM.Disp);
+}
+
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ return addLeaAddress(MIB, AM).addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference keeps the abstract FrameIndex as its base register until it is
+/// resolved to a concrete register and offset. A constant offset can be
+/// specified as well.
+///
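+/// An illustrative use (opcode chosen only as an example):
+///   addFrameReference(BuildMI(MBB, MI, DL, TII.get(X86::MOV32mr)), FrameIdx)
+///     .addReg(SrcReg);
+/// stores SrcReg to stack slot FrameIdx and attaches a MachineMemOperand
+/// recording the slot's offset, size, and alignment.
+///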
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned Flags = 0;
+ if (TID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (TID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand MMO(PseudoSourceValue::getFixedStack(FI),
+ Flags,
+ MFI.getObjectOffset(FI) + Offset,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg = 0) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI).addReg(0);
+}
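+// Illustrative sketch (hypothetical code): materializing a floating point
+// constant from the constant pool into an SSE register might look roughly
+// like
+//   addConstantPoolReference(BuildMI(MBB, MI, DL, TII.get(X86::MOVSSrm), Reg),
+//                            CPI, PICBase);
+// where PICBase is 0 except in x86-32 PIC code.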
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..bc7def4
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,597 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInFlag, SDNPMayStore]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutFlag, SDNPMayLoad]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP32:$src),
+ "##FP32_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP32:$src),
+ "##FP32_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP32:$src),
+ "##FP32_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP64:$src),
+ "##FP64_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP64:$src),
+ "##FP64_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP64:$src),
+ "##FP64_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP80:$src),
+ "##FP80_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP80:$src),
+ "##FP80_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP80:$src),
+ "##FP80_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+ let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+ def FP_REG_KILL : I<0, Pseudo, (outs), (ins), "##FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32",
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
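+//
+// As a concrete example using definitions that appear later in this file:
+// an f32 add is selected as the pseudo ADD_Fp32 on RFP32 registers, while
+// the forms the assembler actually sees are FPI instructions such as
+// ADD_FST0r and ADD_F32m on the RST stack; the FP stackifier rewrites the
+// former into the latter after register allocation.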
+
+// Pseudo Instructions for FP stack return values.
+def FpGET_ST0_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+
+// FpGET_ST1* should only be issued *after* an FpGET_ST0* has been issued when
+// there are two values live out on the stack from a call or inlineasm. This
+// magic is handled by the stackifier. It is not valid to emit FpGET_ST1* and
+// then FpGET_ST0*. In addition, it is invalid for any FP-using operations to
+// occur between them.
+def FpGET_ST1_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+
+let Defs = [ST0] in {
+def FpSET_ST0_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(0) = FPR
+}
+
+let Defs = [ST1] in {
+def FpSET_ST1_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR
+}
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction templates.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Register copies. Just copies; the shortening ones do not truncate.
+let neverHasSideEffects = 1 in {
+ def MOV_Fp3232 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp3264 : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp6432 : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp6464 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8032 : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp3280 : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp8064 : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp6480 : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8080 : FpI_ <(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>;
+}
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{s}\t$src"))> { let mayLoad = 1; }
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{l}\t$src"))> { let mayLoad = 1; }
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{s}\t$src"))> { let mayLoad = 1; }
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{l}\t$src"))> { let mayLoad = 1; }
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub, MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
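+// As a concrete illustration of the multiclass expansion above: the
+// "defm CHS" line creates the pseudos CHS_Fp32, CHS_Fp64 and CHS_Fp80
+// (referenced by the fpimmneg patterns near the end of this file) together
+// with the real instruction CHS_F, which encodes as D9 E0 ("fchs").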
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>;
+}
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                 "fcmovu\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1, mayHaveSideEffects = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1 in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+}
+let mayStore = 1 in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+}
+
+// FISTTP requires SSE3 even though it's an FPStack op.
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+
+let mayStore = 1 in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">;
+}
+
+// FP Stack manipulation instructions.
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+
+
+// Floating point compares.
+let Defs = [EFLAGS] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(X86cmp RFP32:$lhs, RFP32:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(X86cmp RFP64:$lhs, RFP64:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(X86cmp RFP80:$lhs, RFP80:$rhs),
+ (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+}
+
+let Defs = [EFLAGS], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucom\t$reg">, DD;
+def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomp\t$reg">, DD;
+def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins),
+ "fucompp">, DA;
+
+def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+}
+
+// Floating point flag ops.
+let Defs = [AX] in
+def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw", []>, DF;
+
+def FNSTCW16m  : I<0xD9, MRM7m,                 // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)]>;
+
+let mayLoad = 1 in
+def FLDCW16m   : I<0xD9, MRM5m,                 // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, RFP80:$src)>;
+
+// Floating point constants -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (MOV_Fp6432 RFP64:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (MOV_Fp8032 RFP80:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (MOV_Fp8064 RFP80:$src)>,
+ Requires<[FPStackf64]>;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..eeed5bd
--- /dev/null
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,285 @@
+//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+ bits<6> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>;
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRMInitReg : Format<32>;
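+
+// The MRM0r..MRM7r and MRM0m..MRM7m forms above are for instructions that
+// use the ModR/M "reg" field as an opcode extension /0../7, applied to a
+// register or to a memory operand respectively; X86InstrFPStack.td, for
+// instance, encodes the memory form of fsub with MRM4m, matching its /4
+// extension.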
+
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm16 : ImmType<2>;
+def Imm32 : ImmType<3>;
+def Imm64 : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class SegFS { bits<2> SegOvrBits = 1; }
+class SegGS { bits<2> SegOvrBits = 2; }
+class TB { bits<4> Prefix = 1; }
+class REP { bits<4> Prefix = 2; }
+class D8 { bits<4> Prefix = 3; }
+class D9 { bits<4> Prefix = 4; }
+class DA { bits<4> Prefix = 5; }
+class DB { bits<4> Prefix = 6; }
+class DC { bits<4> Prefix = 7; }
+class DD { bits<4> Prefix = 8; }
+class DE { bits<4> Prefix = 9; }
+class DF { bits<4> Prefix = 10; }
+class XD { bits<4> Prefix = 11; }
+class XS { bits<4> Prefix = 12; }
+class T8 { bits<4> Prefix = 13; }
+class TA { bits<4> Prefix = 14; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<6> FormBits = Form.Value;
+ ImmType ImmT = i;
+ bits<3> ImmTypeBits = ImmT.Value;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
+
+ bits<4> Prefix = 0; // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+ FPFormat FPForm; // What flavor of FP instruction is this?
+ bits<3> FPFormBits = 0;
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ bits<2> SegOvrBits = 0; // Segment override prefix.
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm8 , outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, ""> {
+ let FPForm = fp; let FPFormBits = FPForm.Value;
+ let Pattern = pattern;
+}
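+// For example, X86InstrFPStack.td defines the real x87 load as
+//   def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+// i.e. opcode 0xD9 with ModR/M extension /0 and no selection pattern, while
+// the matching pseudo LD_Fp32m carries the selection pattern instead.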
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// SSIi8 - SSE1 instructions with ImmT == Imm8 and XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. These instructions are placed here because they
+// fit the SSSE3 instruction category better than the MMX category.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SSI<o, F, outs, ins, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SDI<o, F, outs, ins, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXRI - MMX instructions with TB prefix and REX.W.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
+
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..2cd3733
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,3227 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetAsmInfo.h"
+
+using namespace llvm;
+
+namespace {
+ cl::opt<bool>
+ NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+ cl::opt<bool>
+ PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+ cl::opt<bool>
+ ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+}
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+ : TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
+ TM(tm), RI(tm, *this) {
+ SmallVector<unsigned,16> AmbEntries;
+ static const unsigned OpTbl2Addr[][2] = {
+ { X86::ADC32ri, X86::ADC32mi },
+ { X86::ADC32ri8, X86::ADC32mi8 },
+ { X86::ADC32rr, X86::ADC32mr },
+ { X86::ADC64ri32, X86::ADC64mi32 },
+ { X86::ADC64ri8, X86::ADC64mi8 },
+ { X86::ADC64rr, X86::ADC64mr },
+ { X86::ADD16ri, X86::ADD16mi },
+ { X86::ADD16ri8, X86::ADD16mi8 },
+ { X86::ADD16rr, X86::ADD16mr },
+ { X86::ADD32ri, X86::ADD32mi },
+ { X86::ADD32ri8, X86::ADD32mi8 },
+ { X86::ADD32rr, X86::ADD32mr },
+ { X86::ADD64ri32, X86::ADD64mi32 },
+ { X86::ADD64ri8, X86::ADD64mi8 },
+ { X86::ADD64rr, X86::ADD64mr },
+ { X86::ADD8ri, X86::ADD8mi },
+ { X86::ADD8rr, X86::ADD8mr },
+ { X86::AND16ri, X86::AND16mi },
+ { X86::AND16ri8, X86::AND16mi8 },
+ { X86::AND16rr, X86::AND16mr },
+ { X86::AND32ri, X86::AND32mi },
+ { X86::AND32ri8, X86::AND32mi8 },
+ { X86::AND32rr, X86::AND32mr },
+ { X86::AND64ri32, X86::AND64mi32 },
+ { X86::AND64ri8, X86::AND64mi8 },
+ { X86::AND64rr, X86::AND64mr },
+ { X86::AND8ri, X86::AND8mi },
+ { X86::AND8rr, X86::AND8mr },
+ { X86::DEC16r, X86::DEC16m },
+ { X86::DEC32r, X86::DEC32m },
+ { X86::DEC64_16r, X86::DEC64_16m },
+ { X86::DEC64_32r, X86::DEC64_32m },
+ { X86::DEC64r, X86::DEC64m },
+ { X86::DEC8r, X86::DEC8m },
+ { X86::INC16r, X86::INC16m },
+ { X86::INC32r, X86::INC32m },
+ { X86::INC64_16r, X86::INC64_16m },
+ { X86::INC64_32r, X86::INC64_32m },
+ { X86::INC64r, X86::INC64m },
+ { X86::INC8r, X86::INC8m },
+ { X86::NEG16r, X86::NEG16m },
+ { X86::NEG32r, X86::NEG32m },
+ { X86::NEG64r, X86::NEG64m },
+ { X86::NEG8r, X86::NEG8m },
+ { X86::NOT16r, X86::NOT16m },
+ { X86::NOT32r, X86::NOT32m },
+ { X86::NOT64r, X86::NOT64m },
+ { X86::NOT8r, X86::NOT8m },
+ { X86::OR16ri, X86::OR16mi },
+ { X86::OR16ri8, X86::OR16mi8 },
+ { X86::OR16rr, X86::OR16mr },
+ { X86::OR32ri, X86::OR32mi },
+ { X86::OR32ri8, X86::OR32mi8 },
+ { X86::OR32rr, X86::OR32mr },
+ { X86::OR64ri32, X86::OR64mi32 },
+ { X86::OR64ri8, X86::OR64mi8 },
+ { X86::OR64rr, X86::OR64mr },
+ { X86::OR8ri, X86::OR8mi },
+ { X86::OR8rr, X86::OR8mr },
+ { X86::ROL16r1, X86::ROL16m1 },
+ { X86::ROL16rCL, X86::ROL16mCL },
+ { X86::ROL16ri, X86::ROL16mi },
+ { X86::ROL32r1, X86::ROL32m1 },
+ { X86::ROL32rCL, X86::ROL32mCL },
+ { X86::ROL32ri, X86::ROL32mi },
+ { X86::ROL64r1, X86::ROL64m1 },
+ { X86::ROL64rCL, X86::ROL64mCL },
+ { X86::ROL64ri, X86::ROL64mi },
+ { X86::ROL8r1, X86::ROL8m1 },
+ { X86::ROL8rCL, X86::ROL8mCL },
+ { X86::ROL8ri, X86::ROL8mi },
+ { X86::ROR16r1, X86::ROR16m1 },
+ { X86::ROR16rCL, X86::ROR16mCL },
+ { X86::ROR16ri, X86::ROR16mi },
+ { X86::ROR32r1, X86::ROR32m1 },
+ { X86::ROR32rCL, X86::ROR32mCL },
+ { X86::ROR32ri, X86::ROR32mi },
+ { X86::ROR64r1, X86::ROR64m1 },
+ { X86::ROR64rCL, X86::ROR64mCL },
+ { X86::ROR64ri, X86::ROR64mi },
+ { X86::ROR8r1, X86::ROR8m1 },
+ { X86::ROR8rCL, X86::ROR8mCL },
+ { X86::ROR8ri, X86::ROR8mi },
+ { X86::SAR16r1, X86::SAR16m1 },
+ { X86::SAR16rCL, X86::SAR16mCL },
+ { X86::SAR16ri, X86::SAR16mi },
+ { X86::SAR32r1, X86::SAR32m1 },
+ { X86::SAR32rCL, X86::SAR32mCL },
+ { X86::SAR32ri, X86::SAR32mi },
+ { X86::SAR64r1, X86::SAR64m1 },
+ { X86::SAR64rCL, X86::SAR64mCL },
+ { X86::SAR64ri, X86::SAR64mi },
+ { X86::SAR8r1, X86::SAR8m1 },
+ { X86::SAR8rCL, X86::SAR8mCL },
+ { X86::SAR8ri, X86::SAR8mi },
+ { X86::SBB32ri, X86::SBB32mi },
+ { X86::SBB32ri8, X86::SBB32mi8 },
+ { X86::SBB32rr, X86::SBB32mr },
+ { X86::SBB64ri32, X86::SBB64mi32 },
+ { X86::SBB64ri8, X86::SBB64mi8 },
+ { X86::SBB64rr, X86::SBB64mr },
+ { X86::SHL16rCL, X86::SHL16mCL },
+ { X86::SHL16ri, X86::SHL16mi },
+ { X86::SHL32rCL, X86::SHL32mCL },
+ { X86::SHL32ri, X86::SHL32mi },
+ { X86::SHL64rCL, X86::SHL64mCL },
+ { X86::SHL64ri, X86::SHL64mi },
+ { X86::SHL8rCL, X86::SHL8mCL },
+ { X86::SHL8ri, X86::SHL8mi },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL },
+ { X86::SHLD16rri8, X86::SHLD16mri8 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL },
+ { X86::SHLD32rri8, X86::SHLD32mri8 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL },
+ { X86::SHLD64rri8, X86::SHLD64mri8 },
+ { X86::SHR16r1, X86::SHR16m1 },
+ { X86::SHR16rCL, X86::SHR16mCL },
+ { X86::SHR16ri, X86::SHR16mi },
+ { X86::SHR32r1, X86::SHR32m1 },
+ { X86::SHR32rCL, X86::SHR32mCL },
+ { X86::SHR32ri, X86::SHR32mi },
+ { X86::SHR64r1, X86::SHR64m1 },
+ { X86::SHR64rCL, X86::SHR64mCL },
+ { X86::SHR64ri, X86::SHR64mi },
+ { X86::SHR8r1, X86::SHR8m1 },
+ { X86::SHR8rCL, X86::SHR8mCL },
+ { X86::SHR8ri, X86::SHR8mi },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL },
+ { X86::SHRD16rri8, X86::SHRD16mri8 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL },
+ { X86::SHRD32rri8, X86::SHRD32mri8 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL },
+ { X86::SHRD64rri8, X86::SHRD64mri8 },
+ { X86::SUB16ri, X86::SUB16mi },
+ { X86::SUB16ri8, X86::SUB16mi8 },
+ { X86::SUB16rr, X86::SUB16mr },
+ { X86::SUB32ri, X86::SUB32mi },
+ { X86::SUB32ri8, X86::SUB32mi8 },
+ { X86::SUB32rr, X86::SUB32mr },
+ { X86::SUB64ri32, X86::SUB64mi32 },
+ { X86::SUB64ri8, X86::SUB64mi8 },
+ { X86::SUB64rr, X86::SUB64mr },
+ { X86::SUB8ri, X86::SUB8mi },
+ { X86::SUB8rr, X86::SUB8mr },
+ { X86::XOR16ri, X86::XOR16mi },
+ { X86::XOR16ri8, X86::XOR16mi8 },
+ { X86::XOR16rr, X86::XOR16mr },
+ { X86::XOR32ri, X86::XOR32mi },
+ { X86::XOR32ri8, X86::XOR32mi8 },
+ { X86::XOR32rr, X86::XOR32mr },
+ { X86::XOR64ri32, X86::XOR64mi32 },
+ { X86::XOR64ri8, X86::XOR64mi8 },
+ { X86::XOR64rr, X86::XOR64mr },
+ { X86::XOR8ri, X86::XOR8mi },
+ { X86::XOR8rr, X86::XOR8mr }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+ unsigned RegOp = OpTbl2Addr[i][0];
+ unsigned MemOp = OpTbl2Addr[i][1];
+ if (!RegOp2MemOpTable2Addr.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+    unsigned AuxInfo = 0 | (1 << 4) | (1 << 5); // Index 0, folded load and store
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp,
+ AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
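+
+  // Note on the AuxInfo packing used by MemOp2RegOpTable: bits 0-3 hold the
+  // index of the operand that gets folded (0 here and for OpTbl0 below, 1
+  // for OpTbl1, and so on), bit 4 is set when the folded operand is a load
+  // and bit 5 when it is a store; the two-address entries above therefore
+  // use 0 | (1 << 4) | (1 << 5), since they fold both a load and a store.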
+
+  // The third value indicates whether the folded operand is a load (1) or a
+  // store (0).
+ static const unsigned OpTbl0[][3] = {
+ { X86::BT16ri8, X86::BT16mi8, 1 },
+ { X86::BT32ri8, X86::BT32mi8, 1 },
+ { X86::BT64ri8, X86::BT64mi8, 1 },
+ { X86::CALL32r, X86::CALL32m, 1 },
+ { X86::CALL64r, X86::CALL64m, 1 },
+ { X86::CMP16ri, X86::CMP16mi, 1 },
+ { X86::CMP16ri8, X86::CMP16mi8, 1 },
+ { X86::CMP16rr, X86::CMP16mr, 1 },
+ { X86::CMP32ri, X86::CMP32mi, 1 },
+ { X86::CMP32ri8, X86::CMP32mi8, 1 },
+ { X86::CMP32rr, X86::CMP32mr, 1 },
+ { X86::CMP64ri32, X86::CMP64mi32, 1 },
+ { X86::CMP64ri8, X86::CMP64mi8, 1 },
+ { X86::CMP64rr, X86::CMP64mr, 1 },
+ { X86::CMP8ri, X86::CMP8mi, 1 },
+ { X86::CMP8rr, X86::CMP8mr, 1 },
+ { X86::DIV16r, X86::DIV16m, 1 },
+ { X86::DIV32r, X86::DIV32m, 1 },
+ { X86::DIV64r, X86::DIV64m, 1 },
+ { X86::DIV8r, X86::DIV8m, 1 },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, 0 },
+ { X86::IDIV16r, X86::IDIV16m, 1 },
+ { X86::IDIV32r, X86::IDIV32m, 1 },
+ { X86::IDIV64r, X86::IDIV64m, 1 },
+ { X86::IDIV8r, X86::IDIV8m, 1 },
+ { X86::IMUL16r, X86::IMUL16m, 1 },
+ { X86::IMUL32r, X86::IMUL32m, 1 },
+ { X86::IMUL64r, X86::IMUL64m, 1 },
+ { X86::IMUL8r, X86::IMUL8m, 1 },
+ { X86::JMP32r, X86::JMP32m, 1 },
+ { X86::JMP64r, X86::JMP64m, 1 },
+ { X86::MOV16ri, X86::MOV16mi, 0 },
+ { X86::MOV16rr, X86::MOV16mr, 0 },
+ { X86::MOV32ri, X86::MOV32mi, 0 },
+ { X86::MOV32rr, X86::MOV32mr, 0 },
+ { X86::MOV64ri32, X86::MOV64mi32, 0 },
+ { X86::MOV64rr, X86::MOV64mr, 0 },
+ { X86::MOV8ri, X86::MOV8mi, 0 },
+ { X86::MOV8rr, X86::MOV8mr, 0 },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDmr, 0 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, 0 },
+ { X86::MOVDQArr, X86::MOVDQAmr, 0 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0 },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0 },
+ { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0 },
+ { X86::MOVSDrr, X86::MOVSDmr, 0 },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, 0 },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0 },
+ { X86::MOVSSrr, X86::MOVSSmr, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDmr, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSmr, 0 },
+ { X86::MUL16r, X86::MUL16m, 1 },
+ { X86::MUL32r, X86::MUL32m, 1 },
+ { X86::MUL64r, X86::MUL64m, 1 },
+ { X86::MUL8r, X86::MUL8m, 1 },
+ { X86::SETAEr, X86::SETAEm, 0 },
+ { X86::SETAr, X86::SETAm, 0 },
+ { X86::SETBEr, X86::SETBEm, 0 },
+ { X86::SETBr, X86::SETBm, 0 },
+ { X86::SETEr, X86::SETEm, 0 },
+ { X86::SETGEr, X86::SETGEm, 0 },
+ { X86::SETGr, X86::SETGm, 0 },
+ { X86::SETLEr, X86::SETLEm, 0 },
+ { X86::SETLr, X86::SETLm, 0 },
+ { X86::SETNEr, X86::SETNEm, 0 },
+ { X86::SETNOr, X86::SETNOm, 0 },
+ { X86::SETNPr, X86::SETNPm, 0 },
+ { X86::SETNSr, X86::SETNSm, 0 },
+ { X86::SETOr, X86::SETOm, 0 },
+ { X86::SETPr, X86::SETPm, 0 },
+ { X86::SETSr, X86::SETSm, 0 },
+ { X86::TAILJMPr, X86::TAILJMPm, 1 },
+ { X86::TEST16ri, X86::TEST16mi, 1 },
+ { X86::TEST32ri, X86::TEST32mi, 1 },
+ { X86::TEST64ri32, X86::TEST64mi32, 1 },
+ { X86::TEST8ri, X86::TEST8mi, 1 }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+ unsigned RegOp = OpTbl0[i][0];
+ unsigned MemOp = OpTbl0[i][1];
+ if (!RegOp2MemOpTable0.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned FoldedLoad = OpTbl0[i][2];
+ // Index 0, folded load or store.
+ unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ static const unsigned OpTbl1[][2] = {
+ { X86::CMP16rr, X86::CMP16rm },
+ { X86::CMP32rr, X86::CMP32rm },
+ { X86::CMP64rr, X86::CMP64rm },
+ { X86::CMP8rr, X86::CMP8rm },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm },
+ { X86::FsMOVAPDrr, X86::MOVSDrm },
+ { X86::FsMOVAPSrr, X86::MOVSSrm },
+ { X86::IMUL16rri, X86::IMUL16rmi },
+ { X86::IMUL16rri8, X86::IMUL16rmi8 },
+ { X86::IMUL32rri, X86::IMUL32rmi },
+ { X86::IMUL32rri8, X86::IMUL32rmi8 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm },
+ { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
+ { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm },
+ { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
+ { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm },
+ { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
+ { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
+ { X86::MOV16rr, X86::MOV16rm },
+ { X86::MOV32rr, X86::MOV32rm },
+ { X86::MOV64rr, X86::MOV64rm },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm },
+ { X86::MOV8rr, X86::MOV8rm },
+ { X86::MOVAPDrr, X86::MOVAPDrm },
+ { X86::MOVAPSrr, X86::MOVAPSrm },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm },
+ { X86::MOVDQArr, X86::MOVDQArm },
+ { X86::MOVSD2PDrr, X86::MOVSD2PDrm },
+ { X86::MOVSDrr, X86::MOVSDrm },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm },
+ { X86::MOVSS2PSrr, X86::MOVSS2PSrm },
+ { X86::MOVSSrr, X86::MOVSSrm },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8 },
+ { X86::MOVUPDrr, X86::MOVUPDrm },
+ { X86::MOVUPSrr, X86::MOVUPSrm },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8 },
+ { X86::PSHUFDri, X86::PSHUFDmi },
+ { X86::PSHUFHWri, X86::PSHUFHWmi },
+ { X86::PSHUFLWri, X86::PSHUFLWmi },
+ { X86::RCPPSr, X86::RCPPSm },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int },
+ { X86::RSQRTPSr, X86::RSQRTPSm },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int },
+ { X86::RSQRTSSr, X86::RSQRTSSm },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int },
+ { X86::SQRTPDr, X86::SQRTPDm },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int },
+ { X86::SQRTPSr, X86::SQRTPSm },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int },
+ { X86::SQRTSDr, X86::SQRTSDm },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int },
+ { X86::SQRTSSr, X86::SQRTSSm },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int },
+ { X86::TEST16rr, X86::TEST16rm },
+ { X86::TEST32rr, X86::TEST32rm },
+ { X86::TEST64rr, X86::TEST64rm },
+ { X86::TEST8rr, X86::TEST8rm },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm },
+ { X86::UCOMISSrr, X86::UCOMISSrm }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+ unsigned RegOp = OpTbl1[i][0];
+ unsigned MemOp = OpTbl1[i][1];
+ if (!RegOp2MemOpTable1.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned AuxInfo = 1 | (1 << 4); // Index 1, folded load
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
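+  // OpTbl1 above maps register forms to memory forms that fold a load into
+  // operand 1; OpTbl2 below does the same for operand 2. In the reverse
+  // (unfolding) map, AuxInfo packs the folded operand index in the low 4 bits
+  // and a "folded load" flag in bit 4.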
+ static const unsigned OpTbl2[][2] = {
+ { X86::ADC32rr, X86::ADC32rm },
+ { X86::ADC64rr, X86::ADC64rm },
+ { X86::ADD16rr, X86::ADD16rm },
+ { X86::ADD32rr, X86::ADD32rm },
+ { X86::ADD64rr, X86::ADD64rm },
+ { X86::ADD8rr, X86::ADD8rm },
+ { X86::ADDPDrr, X86::ADDPDrm },
+ { X86::ADDPSrr, X86::ADDPSrm },
+ { X86::ADDSDrr, X86::ADDSDrm },
+ { X86::ADDSSrr, X86::ADDSSrm },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm },
+ { X86::AND16rr, X86::AND16rm },
+ { X86::AND32rr, X86::AND32rm },
+ { X86::AND64rr, X86::AND64rm },
+ { X86::AND8rr, X86::AND8rm },
+ { X86::ANDNPDrr, X86::ANDNPDrm },
+ { X86::ANDNPSrr, X86::ANDNPSrm },
+ { X86::ANDPDrr, X86::ANDPDrm },
+ { X86::ANDPSrr, X86::ANDPSrm },
+ { X86::CMOVA16rr, X86::CMOVA16rm },
+ { X86::CMOVA32rr, X86::CMOVA32rm },
+ { X86::CMOVA64rr, X86::CMOVA64rm },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm },
+ { X86::CMOVB16rr, X86::CMOVB16rm },
+ { X86::CMOVB32rr, X86::CMOVB32rm },
+ { X86::CMOVB64rr, X86::CMOVB64rm },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm },
+ { X86::CMOVE16rr, X86::CMOVE16rm },
+ { X86::CMOVE32rr, X86::CMOVE32rm },
+ { X86::CMOVE64rr, X86::CMOVE64rm },
+ { X86::CMOVG16rr, X86::CMOVG16rm },
+ { X86::CMOVG32rr, X86::CMOVG32rm },
+ { X86::CMOVG64rr, X86::CMOVG64rm },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm },
+ { X86::CMOVL16rr, X86::CMOVL16rm },
+ { X86::CMOVL32rr, X86::CMOVL32rm },
+ { X86::CMOVL64rr, X86::CMOVL64rm },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm },
+ { X86::CMOVO16rr, X86::CMOVO16rm },
+ { X86::CMOVO32rr, X86::CMOVO32rm },
+ { X86::CMOVO64rr, X86::CMOVO64rm },
+ { X86::CMOVP16rr, X86::CMOVP16rm },
+ { X86::CMOVP32rr, X86::CMOVP32rm },
+ { X86::CMOVP64rr, X86::CMOVP64rm },
+ { X86::CMOVS16rr, X86::CMOVS16rm },
+ { X86::CMOVS32rr, X86::CMOVS32rm },
+ { X86::CMOVS64rr, X86::CMOVS64rm },
+ { X86::CMPPDrri, X86::CMPPDrmi },
+ { X86::CMPPSrri, X86::CMPPSrmi },
+ { X86::CMPSDrr, X86::CMPSDrm },
+ { X86::CMPSSrr, X86::CMPSSrm },
+ { X86::DIVPDrr, X86::DIVPDrm },
+ { X86::DIVPSrr, X86::DIVPSrm },
+ { X86::DIVSDrr, X86::DIVSDrm },
+ { X86::DIVSSrr, X86::DIVSSrm },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm },
+ { X86::FsANDPDrr, X86::FsANDPDrm },
+ { X86::FsANDPSrr, X86::FsANDPSrm },
+ { X86::FsORPDrr, X86::FsORPDrm },
+ { X86::FsORPSrr, X86::FsORPSrm },
+ { X86::FsXORPDrr, X86::FsXORPDrm },
+ { X86::FsXORPSrr, X86::FsXORPSrm },
+ { X86::HADDPDrr, X86::HADDPDrm },
+ { X86::HADDPSrr, X86::HADDPSrm },
+ { X86::HSUBPDrr, X86::HSUBPDrm },
+ { X86::HSUBPSrr, X86::HSUBPSrm },
+ { X86::IMUL16rr, X86::IMUL16rm },
+ { X86::IMUL32rr, X86::IMUL32rm },
+ { X86::IMUL64rr, X86::IMUL64rm },
+ { X86::MAXPDrr, X86::MAXPDrm },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int },
+ { X86::MAXPSrr, X86::MAXPSrm },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int },
+ { X86::MAXSDrr, X86::MAXSDrm },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int },
+ { X86::MAXSSrr, X86::MAXSSrm },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int },
+ { X86::MINPDrr, X86::MINPDrm },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int },
+ { X86::MINPSrr, X86::MINPSrm },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int },
+ { X86::MINSDrr, X86::MINSDrm },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int },
+ { X86::MINSSrr, X86::MINSSrm },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int },
+ { X86::MULPDrr, X86::MULPDrm },
+ { X86::MULPSrr, X86::MULPSrm },
+ { X86::MULSDrr, X86::MULSDrm },
+ { X86::MULSSrr, X86::MULSSrm },
+ { X86::OR16rr, X86::OR16rm },
+ { X86::OR32rr, X86::OR32rm },
+ { X86::OR64rr, X86::OR64rm },
+ { X86::OR8rr, X86::OR8rm },
+ { X86::ORPDrr, X86::ORPDrm },
+ { X86::ORPSrr, X86::ORPSrm },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm },
+ { X86::PADDBrr, X86::PADDBrm },
+ { X86::PADDDrr, X86::PADDDrm },
+ { X86::PADDQrr, X86::PADDQrm },
+ { X86::PADDSBrr, X86::PADDSBrm },
+ { X86::PADDSWrr, X86::PADDSWrm },
+ { X86::PADDWrr, X86::PADDWrm },
+ { X86::PANDNrr, X86::PANDNrm },
+ { X86::PANDrr, X86::PANDrm },
+ { X86::PAVGBrr, X86::PAVGBrm },
+ { X86::PAVGWrr, X86::PAVGWrm },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm },
+ { X86::PINSRWrri, X86::PINSRWrmi },
+ { X86::PMADDWDrr, X86::PMADDWDrm },
+ { X86::PMAXSWrr, X86::PMAXSWrm },
+ { X86::PMAXUBrr, X86::PMAXUBrm },
+ { X86::PMINSWrr, X86::PMINSWrm },
+ { X86::PMINUBrr, X86::PMINUBrm },
+ { X86::PMULDQrr, X86::PMULDQrm },
+ { X86::PMULHUWrr, X86::PMULHUWrm },
+ { X86::PMULHWrr, X86::PMULHWrm },
+ { X86::PMULLDrr, X86::PMULLDrm },
+ { X86::PMULLDrr_int, X86::PMULLDrm_int },
+ { X86::PMULLWrr, X86::PMULLWrm },
+ { X86::PMULUDQrr, X86::PMULUDQrm },
+ { X86::PORrr, X86::PORrm },
+ { X86::PSADBWrr, X86::PSADBWrm },
+ { X86::PSLLDrr, X86::PSLLDrm },
+ { X86::PSLLQrr, X86::PSLLQrm },
+ { X86::PSLLWrr, X86::PSLLWrm },
+ { X86::PSRADrr, X86::PSRADrm },
+ { X86::PSRAWrr, X86::PSRAWrm },
+ { X86::PSRLDrr, X86::PSRLDrm },
+ { X86::PSRLQrr, X86::PSRLQrm },
+ { X86::PSRLWrr, X86::PSRLWrm },
+ { X86::PSUBBrr, X86::PSUBBrm },
+ { X86::PSUBDrr, X86::PSUBDrm },
+ { X86::PSUBSBrr, X86::PSUBSBrm },
+ { X86::PSUBSWrr, X86::PSUBSWrm },
+ { X86::PSUBWrr, X86::PSUBWrm },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm },
+ { X86::PXORrr, X86::PXORrm },
+ { X86::SBB32rr, X86::SBB32rm },
+ { X86::SBB64rr, X86::SBB64rm },
+ { X86::SHUFPDrri, X86::SHUFPDrmi },
+ { X86::SHUFPSrri, X86::SHUFPSrmi },
+ { X86::SUB16rr, X86::SUB16rm },
+ { X86::SUB32rr, X86::SUB32rm },
+ { X86::SUB64rr, X86::SUB64rm },
+ { X86::SUB8rr, X86::SUB8rm },
+ { X86::SUBPDrr, X86::SUBPDrm },
+ { X86::SUBPSrr, X86::SUBPSrm },
+ { X86::SUBSDrr, X86::SUBSDrm },
+ { X86::SUBSSrr, X86::SUBSSrm },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm },
+ { X86::XOR16rr, X86::XOR16rm },
+ { X86::XOR32rr, X86::XOR32rm },
+ { X86::XOR64rr, X86::XOR64rm },
+ { X86::XOR8rr, X86::XOR8rm },
+ { X86::XORPDrr, X86::XORPDrm },
+ { X86::XORPSrr, X86::XORPSrm }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+ unsigned RegOp = OpTbl2[i][0];
+ unsigned MemOp = OpTbl2[i][1];
+ if (!RegOp2MemOpTable2.insert(std::make_pair((unsigned*)RegOp,
+ MemOp)).second)
+ assert(false && "Duplicated entries?");
+ unsigned AuxInfo = 2 | (1 << 4); // Index 2, folded load
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+  // There should be no ambiguous entries left in the unfolding maps.
+ assert(AmbEntries.empty() && "Duplicated entries in unfolding maps?");
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case X86::MOV8rr:
+ case X86::MOV8rr_NOREX:
+ case X86::MOV16rr:
+ case X86::MOV32rr:
+ case X86::MOV64rr:
+ case X86::MOVSSrr:
+ case X86::MOVSDrr:
+
+ // FP Stack register class copies
+ case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
+ case X86::MOV_Fp3264: case X86::MOV_Fp3280:
+ case X86::MOV_Fp6432: case X86::MOV_Fp8032:
+
+ case X86::FsMOVAPSrr:
+ case X86::FsMOVAPDrr:
+ case X86::MOVAPSrr:
+ case X86::MOVAPDrr:
+ case X86::MOVDQArr:
+ case X86::MOVSS2PSrr:
+ case X86::MOVSD2PDrr:
+ case X86::MOVPS2SSrr:
+ case X86::MOVPD2SDrr:
+ case X86::MMX_MOVQ64rr:
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid register-register move instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcSubIdx = MI.getOperand(1).getSubReg();
+ DstSubIdx = MI.getOperand(0).getSubReg();
+ return true;
+ }
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
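+    // Only recognize a simple load from a frame index: scale 1, no index
+    // register, and zero displacement.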
+ if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(4).isImm() &&
+ MI->getOperand(2).getImm() == 1 &&
+ MI->getOperand(3).getReg() == 0 &&
+ MI->getOperand(4).getImm() == 0) {
+ FrameIndex = MI->getOperand(1).getIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVDQAmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
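+    // Only recognize a simple store to a frame index: scale 1, no index
+    // register, and zero displacement; the stored register follows the
+    // address operands.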
+ if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+ MI->getOperand(2).isReg() && MI->getOperand(3).isImm() &&
+ MI->getOperand(1).getImm() == 1 &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImm() == 0) {
+ FrameIndex = MI->getOperand(0).getIndex();
+ return MI->getOperand(X86AddrNumOperands).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+
+/// regIsPICBase - Return true if the register is a PIC base (i.e., it is
+/// defined by X86::MOVPC32r).
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+/// isGVStub - Return true if the GV requires an extra load to get the
+/// real address.
+static inline bool isGVStub(GlobalValue *GV, X86TargetMachine &TM) {
+ return TM.getSubtarget<X86Subtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1).isReg() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ (MI->getOperand(4).isCPI() ||
+ (MI->getOperand(4).isGlobal() &&
+ isGVStub(MI->getOperand(4).getGlobal(), TM)))) {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
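+      // This mirrors regIsPICBase() above: the base register is only
+      // acceptable if its sole definition is X86::MOVPC32r.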
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ !MI->getOperand(4).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+/// that would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// two instructions, it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ // It's always safe to clobber EFLAGS at the end of a block.
+ if (I == MBB.end())
+ return true;
+
+  // To keep compile time reasonable, if we are not able to determine the
+  // safety after visiting 2 instructions, we will assume it's not safe.
+ for (unsigned i = 0; i < 2; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = I->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = I->getOperand(j);
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++I;
+
+ // If we make it to the end of the block, it's safe to clobber EFLAGS.
+ if (I == MBB.end())
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ const MachineInstr *Orig) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (I != MBB.end()) DL = I->getDebugLoc();
+
+ unsigned SubIdx = Orig->getOperand(0).isReg()
+ ? Orig->getOperand(0).getSubReg() : 0;
+ bool ChangeSubIdx = SubIdx != 0;
+ if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+ DestReg = RI.getSubReg(DestReg, SubIdx);
+ SubIdx = 0;
+ }
+
+  // MOV32r0 etc. are implemented with xor, which clobbers the condition code.
+  // Re-materialize them as movri instructions to avoid side effects.
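+  // For example, a MOV32r0 next to a live EFLAGS use is re-materialized as a
+  // MOV32ri of 0 instead of being cloned as the flag-clobbering xor.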
+ bool Emitted = false;
+ switch (Orig->getOpcode()) {
+ default: break;
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0: {
+ if (!isSafeToClobberEFLAGS(MBB, I)) {
+ unsigned Opc = 0;
+ switch (Orig->getOpcode()) {
+ default: break;
+ case X86::MOV8r0: Opc = X86::MOV8ri; break;
+ case X86::MOV16r0: Opc = X86::MOV16ri; break;
+ case X86::MOV32r0: Opc = X86::MOV32ri; break;
+ case X86::MOV64r0: Opc = X86::MOV64ri32; break;
+ }
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
+ Emitted = true;
+ }
+ break;
+ }
+ }
+
+ if (!Emitted) {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+ }
+
+ if (ChangeSubIdx) {
+ MachineInstr *NewMI = prior(I);
+ NewMI->getOperand(0).setSubReg(SubIdx);
+ }
+}
+
+/// isInvariantLoad - Return true if the specified instruction (which is marked
+/// mayLoad) is loading from a location whose value is invariant across the
+/// function. For example, loading a value from the constant pool or from
+/// the argument area of a function, if that value does not change. This
+/// should only return true if *all* loads the instruction does are invariant
+/// (if it does multiple loads).
+bool X86InstrInfo::isInvariantLoad(const MachineInstr *MI) const {
+ // This code cares about loads from three cases: constant pool entries,
+ // invariant argument slots, and global stubs. In order to handle these cases
+ // for all of the myriad of X86 instructions, we just scan for a CP/FI/GV
+ // operand and base our analysis on it. This is safe because the address of
+ // none of these three cases is ever used as anything other than a load base
+ // and X86 doesn't have any instructions that load from multiple places.
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Loads from constant pools are trivially invariant.
+ if (MO.isCPI())
+ return true;
+
+ if (MO.isGlobal())
+ return isGVStub(MO.getGlobal(), TM);
+
+ // If this is a load from an invariant stack slot, the load is a constant.
+ if (MO.isFI()) {
+ const MachineFrameInfo &MFI =
+ *MI->getParent()->getParent()->getFrameInfo();
+ int Idx = MO.getIndex();
+ return MFI.isFixedObjectIndex(Idx) && MFI.isImmutableObjectIndex(Idx);
+ }
+ }
+
+ // All other instances of these instructions are presumed to have other
+ // issues.
+ return false;
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+  // All input instructions are two-addr instructions. Get the known operands.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEAs are really slow on Athlons, but not bad on P4s. When
+  // we have better subtarget support, enable 16-bit LEA generation here.
+ bool DisableLEA16 = true;
+
+ unsigned MIOpc = MI->getOpcode();
+ switch (MIOpc) {
+ case X86::SHUFPSrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned M = MI->getOperand(3).getImm();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addReg(A, RegState::Define | getDeadRegState(isDead))
+ .addReg(B, getKillRegState(isKill)).addImm(M);
+ break;
+ }
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
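+    // The shift becomes an LEA whose scale is 1 << ShAmt with no base or
+    // displacement, e.g. a left shift by 3 turns into an LEA with scale 8.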
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill)).addImm(0);
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ if (DisableLEA16) {
+ // If 16-bit LEA is disabled, use 32-bit LEA via subregisters.
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::LEA64_32r : X86::LEA32r;
+ unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // we'll be shifting and then extracting the lower 16 bits.
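+      // The emitted sequence is: INSERT_SUBREG the 16-bit source into an
+      // IMPLICIT_DEF 32-bit register, perform a 32-bit LEA, then
+      // EXTRACT_SUBREG the low 16 bits into the original destination.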
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
+ .addReg(leaInReg)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(X86::SUBREG_16BIT);
+
+ NewMI = BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(Opc), leaOutReg)
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(leaInReg, RegState::Kill)
+ .addImm(0);
+
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill)
+ .addImm(X86::SUBREG_16BIT);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, ExtMI);
+ }
+ return ExtMI;
+ } else {
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ }
+ break;
+ }
+ default: {
+    // The following opcodes also set the condition code register(s). Only
+    // convert them to an equivalent LEA if the condition code register defs
+    // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ switch (MIOpc) {
+ default: return 0;
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::INC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::DEC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ }
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD32rr: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD16rr: {
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm())
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm()) {
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ }
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImm())
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ case X86::SHL16ri:
+ if (DisableLEA16) return 0;
+ case X86::SHL32ri:
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && MI->getOperand(2).isImm() &&
+ "Unknown shl instruction!");
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+ X86AddressMode AM;
+ AM.Scale = 1 << ShAmt;
+ AM.IndexReg = Src;
+ unsigned Opc = MIOpc == X86::SHL64ri ? X86::LEA64r
+ : (MIOpc == X86::SHL32ri
+ ? (is64Bit ? X86::LEA64_32r : X86::LEA32r) : X86::LEA16r);
+ NewMI = addFullAddress(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)), AM);
+ if (isKill)
+ NewMI->getOperand(3).setIsKill(true);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ if (!NewMI) return 0;
+
+ if (LV) { // Update live variables
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, NewMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, NewMI);
+ }
+
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
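+    // The commuted form uses a shift amount of Size minus the original
+    // amount, e.g. an SHRD32rri8 by 5 becomes an SHLD32rri8 by 27.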
+ unsigned Amt = MI->getOperand(3).getImm();
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+ case X86::CMOVB16rr:
+ case X86::CMOVB32rr:
+ case X86::CMOVB64rr:
+ case X86::CMOVAE16rr:
+ case X86::CMOVAE32rr:
+ case X86::CMOVAE64rr:
+ case X86::CMOVE16rr:
+ case X86::CMOVE32rr:
+ case X86::CMOVE64rr:
+ case X86::CMOVNE16rr:
+ case X86::CMOVNE32rr:
+ case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr:
+ case X86::CMOVBE32rr:
+ case X86::CMOVBE64rr:
+ case X86::CMOVA16rr:
+ case X86::CMOVA32rr:
+ case X86::CMOVA64rr:
+ case X86::CMOVL16rr:
+ case X86::CMOVL32rr:
+ case X86::CMOVL64rr:
+ case X86::CMOVGE16rr:
+ case X86::CMOVGE32rr:
+ case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr:
+ case X86::CMOVLE32rr:
+ case X86::CMOVLE64rr:
+ case X86::CMOVG16rr:
+ case X86::CMOVG32rr:
+ case X86::CMOVG64rr:
+ case X86::CMOVS16rr:
+ case X86::CMOVS32rr:
+ case X86::CMOVS64rr:
+ case X86::CMOVNS16rr:
+ case X86::CMOVNS32rr:
+ case X86::CMOVNS64rr:
+ case X86::CMOVP16rr:
+ case X86::CMOVP32rr:
+ case X86::CMOVP64rr:
+ case X86::CMOVNP16rr:
+ case X86::CMOVNP32rr:
+ case X86::CMOVNP64rr:
+ case X86::CMOVO16rr:
+ case X86::CMOVO32rr:
+ case X86::CMOVO64rr:
+ case X86::CMOVNO16rr:
+ case X86::CMOVNO32rr:
+ case X86::CMOVNO64rr: {
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ // Fallthrough intended.
+ }
+ default:
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE: return X86::COND_E;
+ case X86::JNE: return X86::COND_NE;
+ case X86::JL: return X86::COND_L;
+ case X86::JLE: return X86::COND_LE;
+ case X86::JG: return X86::COND_G;
+ case X86::JGE: return X86::COND_GE;
+ case X86::JB: return X86::COND_B;
+ case X86::JBE: return X86::COND_BE;
+ case X86::JA: return X86::COND_A;
+ case X86::JAE: return X86::COND_AE;
+ case X86::JS: return X86::COND_S;
+ case X86::JNS: return X86::COND_NS;
+ case X86::JP: return X86::COND_P;
+ case X86::JNP: return X86::COND_NP;
+ case X86::JO: return X86::COND_O;
+ case X86::JNO: return X86::COND_NO;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::JE;
+ case X86::COND_NE: return X86::JNE;
+ case X86::COND_L: return X86::JL;
+ case X86::COND_LE: return X86::JLE;
+ case X86::COND_G: return X86::JG;
+ case X86::COND_GE: return X86::JGE;
+ case X86::COND_B: return X86::JB;
+ case X86::COND_BE: return X86::JBE;
+ case X86::COND_A: return X86::JA;
+ case X86::COND_AE: return X86::JAE;
+ case X86::COND_S: return X86::JS;
+ case X86::COND_NS: return X86::JNS;
+ case X86::COND_P: return X86::JP;
+ case X86::COND_NP: return X86::JNP;
+ case X86::COND_O: return X86::JO;
+ case X86::COND_NO: return X86::JNO;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (TID.isBranch() && !TID.isBarrier())
+ return true;
+ if (!TID.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+static bool isBrAnalysisUnpredicatedTerminator(const MachineInstr *MI,
+ const X86InstrInfo &TII) {
+ if (MI->getOpcode() == X86::FP_REG_KILL)
+ return false;
+ return TII.isUnpredicatedTerminator(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ // Working from the bottom, when we see a non-terminator
+ // instruction, we're done.
+ if (!isBrAnalysisUnpredicatedTerminator(I, *this))
+ break;
+ // A terminator that isn't a branch can't easily be handled
+ // by this analysis.
+ if (!I->getDesc().isBranch())
+ return true;
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP) {
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (next(I) != MBB.end())
+ next(I)->eraseFromParent();
+ Cond.clear();
+ FBB = 0;
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ continue;
+ }
+      // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+ // Handle conditional branches.
+ X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+ // Handle subsequent conditional branches. Only handle the case
+ // where all conditional branches branch to the same destination
+ // and their condition opcodes fit one of the special
+ // multi-branch idioms.
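+    // (COND_NP_OR_E and COND_NE_OR_P are pseudo condition codes that
+    // InsertBranch later expands back into a pair of conditional branches.)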
+ assert(Cond.size() == 1);
+ assert(TBB);
+ // Only handle the case where all conditional branches branch to
+ // the same destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ // If the conditions are the same, we can leave them alone.
+ if (OldBranchCode == BranchCode)
+ continue;
+ // If they differ, see if they fit one of the known patterns.
+ // Theoretically we could handle more patterns here, but
+ // we shouldn't expect to see them if instruction selection
+ // has done a reasonable job.
+ if ((OldBranchCode == X86::COND_NP &&
+ BranchCode == X86::COND_E) ||
+ (OldBranchCode == X86::COND_E &&
+ BranchCode == X86::COND_NP))
+ BranchCode = X86::COND_NP_OR_E;
+ else if ((OldBranchCode == X86::COND_P &&
+ BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE &&
+ BranchCode == X86::COND_P))
+ BranchCode = X86::COND_NE_OR_P;
+ else
+ return true;
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ }
+
+ return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->getOpcode() != X86::JMP &&
+ GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc operand
+ DebugLoc dl = DebugLoc::getUnknownLoc();
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(X86::JMP)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NP_OR_E:
+ // Synthesize NP_OR_E with two branches.
+ BuildMI(&MBB, dl, get(X86::JNP)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JE)).addMBB(TBB);
+ ++Count;
+ break;
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, dl, get(X86::JNE)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JP)).addMBB(TBB);
+ ++Count;
+ break;
+ default: {
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
+ ++Count;
+ }
+ }
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, dl, get(X86::JMP)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const {
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  // Determine if DestRC and SrcRC have a common superclass.
+ const TargetRegisterClass *CommonRC = DestRC;
+ if (DestRC == SrcRC)
+ /* Source and destination have the same register class. */;
+ else if (CommonRC->hasSuperClass(SrcRC))
+ CommonRC = SrcRC;
+ else if (!DestRC->hasSubClass(SrcRC))
+ CommonRC = 0;
+
+ if (CommonRC) {
+ unsigned Opc;
+ if (CommonRC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::RFP32RegClass) {
+ Opc = X86::MOV_Fp3232;
+ } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) {
+ Opc = X86::MOV_Fp6464;
+ } else if (CommonRC == &X86::RFP80RegClass) {
+ Opc = X86::MOV_Fp8080;
+ } else if (CommonRC == &X86::FR32RegClass) {
+ Opc = X86::FsMOVAPSrr;
+ } else if (CommonRC == &X86::FR64RegClass) {
+ Opc = X86::FsMOVAPDrr;
+ } else if (CommonRC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSrr;
+ } else if (CommonRC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rr;
+ } else {
+ return false;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ if (SrcRC == &X86::CCRRegClass) {
+ if (SrcReg != X86::EFLAGS)
+ return false;
+ if (DestRC == &X86::GR64RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHFQ));
+ BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+ return true;
+ } else if (DestRC == &X86::GR32RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHFD));
+ BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+ return true;
+ }
+ } else if (DestRC == &X86::CCRRegClass) {
+ if (DestReg != X86::EFLAGS)
+ return false;
+ if (SrcRC == &X86::GR64RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPFQ));
+ return true;
+ } else if (SrcRC == &X86::GR32RegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPFD));
+ return true;
+ }
+ }
+
+ // Moving from ST(0) turns into FpGET_ST0_32 etc.
+ if (SrcRC == &X86::RSTRegClass) {
+ // Copying from ST(0)/ST(1).
+ if (SrcReg != X86::ST0 && SrcReg != X86::ST1)
+ // Can only copy from ST(0)/ST(1) right now
+ return false;
+ bool isST0 = SrcReg == X86::ST0;
+ unsigned Opc;
+ if (DestRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
+ else if (DestRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
+ else {
+ if (DestRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg);
+ return true;
+ }
+
+ // Moving to ST(0) turns into FpSET_ST0_32 etc.
+ if (DestRC == &X86::RSTRegClass) {
+ // Copying to ST(0) / ST(1).
+ if (DestReg != X86::ST0 && DestReg != X86::ST1)
+ // Can only copy to TOS right now
+ return false;
+ bool isST0 = DestReg == X86::ST0;
+ unsigned Opc;
+ if (SrcRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_32 : X86::FpSET_ST1_32;
+ else if (SrcRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_64 : X86::FpSET_ST1_64;
+ else {
+ if (SrcRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpSET_ST0_80 : X86::FpSET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc)).addReg(SrcReg);
+ return true;
+ }
+
+ // Not yet supported!
+ return false;
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(SrcReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::ST_FpP80m; // pops
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::ST_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::ST_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSmr;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDmr;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned stores.
+ Opc = isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64mr;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ const MachineFunction &MF = *MBB.getParent();
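+  // Aligned SSE stores are only used when the incoming stack alignment is at
+  // least 16 bytes or the stack will be realigned for this function.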
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ NewMIs.push_back(MIB);
+}
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(DestReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_NOREXRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::LD_Fp80m;
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::LD_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::LD_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSrm;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDrm;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned loads.
+ Opc = isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rm;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ const MachineFunction &MF = *MBB.getParent();
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ unsigned SlotSize = is64Bit ? 8 : 4;
+
+ MachineFunction &MF = *MBB.getParent();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
+
+ unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
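+  // Push the registers in reverse order so that restoreCalleeSavedRegisters
+  // can pop them back in forward CSI order.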
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ BuildMI(MBB, MI, DL, get(Opc))
+ .addReg(Reg, RegState::Kill);
+ }
+ return true;
+}
+
+bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ unsigned Opc = is64Bit ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ BuildMI(MBB, MI, DL, get(Opc), Reg);
+ }
+ return true;
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+
+ // Loop over the rest of the ri operands, converting them over.
+ unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ MIB.addOperand(MO);
+ }
+ for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ MIB.addOperand(MO);
+ }
+ return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF,
+ unsigned Opcode, unsigned OpNo,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI, unsigned i,
+ const SmallVectorImpl<MachineOperand> &MOs) const{
+ const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ bool isTwoAddrFold = false;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ MachineInstr *NewMI = NULL;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different from folding it elsewhere. It requires
+  // replacing the *two* registers with the memory location.
+ if (isTwoAddr && NumOps >= 2 && i < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (i == 0) { // If operand 0
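+    // The register-zeroing pseudos (MOV*r0) fold directly into a store of
+    // immediate zero.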
+ if (MI->getOpcode() == X86::MOV16r0)
+ NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV32r0)
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV64r0)
+ NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV8r0)
+ NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+ if (NewMI)
+ return NewMI;
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (i == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (i == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ // If table selected...
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, unsigned>::iterator I =
+ OpcodeTablePtr->find((unsigned*)MI->getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, I->second, MOs, MI, *this);
+ else
+ NewMI = FuseInst(MF, I->second, i, MOs, MI, *this);
+ return NewMI;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing)
+ cerr << "We failed to fuse operand " << i << " in " << *MI;
+ return NULL;
+}
+
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // FIXME: Move alignment requirement into tables?
+ if (Alignment < 16) {
+ switch (MI->getOpcode()) {
+ default: break;
+    // Not always safe to fold movsd into these instructions since their load
+    // folding variants expect the address to be 16-byte aligned.
+ case X86::FsANDNPDrr:
+ case X86::FsANDNPSrr:
+ case X86::FsANDPDrr:
+ case X86::FsANDPSrr:
+ case X86::FsORPDrr:
+ case X86::FsORPSrr:
+ case X86::FsXORPDrr:
+ case X86::FsXORPSrr:
+ return NULL;
+ }
+ }
+
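+  // Folding both operands of a TESTrr (which tests a register against itself)
+  // means the tested value comes from memory; rewrite it as CMPri r, 0 first
+  // so only a single operand needs to be replaced.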
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,4> MOs;
+ MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr *LoadMI) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI->hasOneMemOperand())
+ Alignment = LoadMI->memoperands_begin()->getAlignment();
+
+ // FIXME: Move alignment requirement into tables?
+ if (Alignment < 16) {
+ switch (MI->getOpcode()) {
+ default: break;
+    // Not always safe to fold movsd into these instructions since their load
+    // folding variants expect the address to be 16-byte aligned.
+ case X86::FsANDNPDrr:
+ case X86::FsANDNPSrr:
+ case X86::FsANDPDrr:
+ case X86::FsANDPSrr:
+ case X86::FsORPDrr:
+ case X86::FsORPSrr:
+ case X86::FsXORPDrr:
+ case X86::FsXORPSrr:
+ return NULL;
+ }
+ }
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+ if (LoadMI->getOpcode() == X86::V_SET0 ||
+ LoadMI->getOpcode() == X86::V_SETALLONES) {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ !TM.getSubtarget<X86Subtarget>().is64Bit())
+ // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+      return NULL;
+
+ // Create a v4i32 constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ const VectorType *Ty = VectorType::get(Type::Int32Ty, 4);
+ Constant *C = LoadMI->getOpcode() == X86::V_SET0 ?
+ ConstantVector::getNullValue(Ty) :
+ ConstantVector::getAllOnesValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, 16);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ } else {
+ // Folding a normal load. Just copy the load's address operands.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+ MOs.push_back(LoadMI->getOperand(i));
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ // Check switch flag
+  if (NoFusing) return false;
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ return true;
+ }
+ }
+
+ if (Ops.size() != 1)
+ return false;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different from folding it in other places: it requires
+  // replacing the *two* registers with the memory location.
+ const DenseMap<unsigned*, unsigned> *OpcodeTablePtr = NULL;
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ } else if (OpNum == 0) { // If operand 0
+ switch (Opc) {
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0:
+ case X86::MOV8r0:
+ return true;
+ default: break;
+ }
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, unsigned>::iterator I =
+ OpcodeTablePtr->find((unsigned*)Opc);
+ if (I != OpcodeTablePtr->end())
+ return true;
+ }
+ return false;
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)MI->getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ DebugLoc dl = MI->getDebugLoc();
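+  // MemOp2RegOpTable maps the memory-form opcode to a pair: the register-form
+  // opcode, plus packed folding info. The low four bits of the info give the
+  // index of the folded memory operand, bit 4 records a folded load, and
+  // bit 5 a folded store.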
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetOperandInfo &TOI = TID.OpInfo[Index];
+ const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (i >= Index && i < Index + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ loadRegFromAddr(MF, Reg, AddrOps, RC, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(TID, MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+ MIB.addOperand(BeforeOps[i]);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+ MIB.addOperand(AfterOps[i]);
+ for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+ MachineOperand &MO = ImpOps[i];
+ MIB.addReg(MO.getReg(),
+ getDefRegState(MO.isDef()) |
+ RegState::Implicit |
+ getKillRegState(MO.isKill()) |
+ getDeadRegState(MO.isDead()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ unsigned NewOpc = 0;
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP32ri:
+ case X86::CMP16ri:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetOperandInfo &DstTOI = TID.OpInfo[0];
+ const TargetRegisterClass *DstRC = DstTOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, NewMIs);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)N->getMachineOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetOperandInfo &TOI = TID.OpInfo[Index];
+ const TargetRegisterClass *RC = TOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(TOI.RegClass);
+ unsigned NumDefs = TID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumOps = N->getNumOperands();
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = 0;
+ const MachineFunction &MF = DAG.getMachineFunction();
+ if (FoldedLoad) {
+ MVT VT = *RC->vt_begin();
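+    // Choose the aligned or unaligned load opcode; only the stack alignment
+    // (or a pending realignment) can guarantee 16-byte alignment here.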
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ Load = DAG.getTargetNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ VT, MVT::Other, &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Load);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<MVT> VTs;
+ const TargetRegisterClass *DstRC = 0;
+ if (TID.getNumDefs() > 0) {
+ const TargetOperandInfo &DstTOI = TID.OpInfo[0];
+ DstRC = DstTOI.isLookupPtrRegClass()
+ ? RI.getPointerRegClass() : RI.getRegClass(DstTOI.RegClass);
+ VTs.push_back(*DstRC->vt_begin());
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+ SDNode *NewNode= DAG.getTargetNode(Opc, dl, VTs, &BeforeOps[0],
+ BeforeOps.size());
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ bool isAligned = (RI.getStackAlignment() >= 16) ||
+ RI.needsStackRealignment(MF);
+ SDNode *Store = DAG.getTargetNode(getStoreRegOpcode(0, DstRC,
+ isAligned, TM),
+ dl, MVT::Other,
+ &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Store);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::iterator I =
+ MemOp2RegOpTable.find((unsigned*)Opc);
+ if (I == MemOp2RegOpTable.end())
+ return 0;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ return I->second.first;
+}
+
+bool X86InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi:
+ case X86::RET: // Return.
+ case X86::RETI:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::JMP: // Uncond branch.
+ case X86::JMP32r: // Indirect branch.
+ case X86::JMP64r: // Indirect branch (64-bit).
+ case X86::JMP32m: // Indirect branch through mem.
+ case X86::JMP64m: // Indirect branch through mem (64-bit).
+ return true;
+ default: return false;
+ }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+ return true;
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+ RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+unsigned X86InstrInfo::sizeOfImm(const TargetInstrDesc *Desc) {
+ switch (Desc->TSFlags & X86II::ImmMask) {
+ case X86II::Imm8: return 1;
+ case X86II::Imm16: return 2;
+ case X86II::Imm32: return 4;
+ case X86II::Imm64: return 8;
+ default: assert(0 && "Immediate size not set!");
+ return 0;
+ }
+}
+
+/// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended register?
+/// e.g. r8, xmm8, etc.
+bool X86InstrInfo::isX86_64ExtendedReg(const MachineOperand &MO) {
+ if (!MO.isReg()) return false;
+ switch (MO.getReg()) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ return true;
+ }
+ return false;
+}
+
+
+/// determineREX - Determine if the MachineInstr has to be encoded with an
+/// X86-64 REX prefix, which specifies 1) 64-bit instructions, 2) non-default
+/// operand size, and 3) use of X86-64 extended registers.
+unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
+ unsigned REX = 0;
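+  // REX bit layout (the 0100WRXB prefix): bit 3 is REX.W (64-bit operand
+  // size), bit 2 is REX.R (extends the ModRM reg field), bit 1 is REX.X
+  // (extends the SIB index field), and bit 0 is REX.B (extends the ModRM
+  // r/m, SIB base, or opcode register field).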
+ const TargetInstrDesc &Desc = MI.getDesc();
+
+ // Pseudo instructions do not need REX prefix byte.
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+ if (Desc.TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ unsigned NumOps = Desc.getNumOperands();
+ if (NumOps) {
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (isX86_64NonExtLowByteReg(Reg))
+ REX |= 0x40;
+ }
+ }
+
+ switch (Desc.TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= (1 << 0) | (1 << 2);
+ break;
+ case X86II::MRMSrcReg: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ }
+ }
+ return REX;
+}
+
+/// sizePCRelativeBlockAddress - This method returns the size of a PC
+/// relative block address instruction
+///
+static unsigned sizePCRelativeBlockAddress() {
+ return 4;
+}
+
+/// sizeGlobalAddress - Give the size of the emission of this global address
+///
+static unsigned sizeGlobalAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeConstPoolAddress - Give the size of the emission of this constant
+/// pool address
+///
+static unsigned sizeConstPoolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeExternalSymbolAddress - Give the size of the emission of this external
+/// symbol
+///
+static unsigned sizeExternalSymbolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeJumpTableAddress - Give the size of the emission of this jump
+/// table address
+///
+static unsigned sizeJumpTableAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+static unsigned sizeConstant(unsigned Size) {
+ return Size;
+}
+
+static unsigned sizeRegModRMByte(){
+ return 1;
+}
+
+static unsigned sizeSIBByte(){
+ return 1;
+}
+
+static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) {
+ unsigned FinalSize = 0;
+  // A simple integer displacement that doesn't require a relocation is
+  // encoded as a plain 4-byte constant.
+ if (!RelocOp) {
+ FinalSize += sizeConstant(4);
+ return FinalSize;
+ }
+
+ // Otherwise, this is something that requires a relocation.
+ if (RelocOp->isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (RelocOp->isCPI()) {
+ FinalSize += sizeConstPoolAddress(false);
+ } else if (RelocOp->isJTI()) {
+ FinalSize += sizeJumpTableAddress(false);
+ } else {
+ assert(0 && "Unknown value to relocate!");
+ }
+ return FinalSize;
+}
+
+static unsigned getMemModRMByteSize(const MachineInstr &MI, unsigned Op,
+ bool IsPIC, bool Is64BitMode) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+ unsigned FinalSize = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else if (Op3.isJTI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else {
+ DispVal = 1;
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
+ if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
+ IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || X86RegisterInfo::getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ } else {
+ unsigned BaseRegNo = X86RegisterInfo::getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit simple indirect register encoding... [EAX] f.e.
+ ++FinalSize;
+ // Be pessimistic and assume it's a disp32, not a disp8
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ if (BaseReg == 0 || DispForReloc) {
+ // Emit the normal disp32 encoding.
+ ++FinalSize;
+ ForceDisp32 = true;
+ } else {
+ ++FinalSize;
+ }
+
+ FinalSize += sizeSIBByte();
+
+ // Do we need to output a displacement?
+ if (DispVal != 0 || ForceDisp32) {
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+ return FinalSize;
+}
+
+
+static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
+ const TargetInstrDesc *Desc,
+ bool IsPIC, bool Is64BitMode) {
+
+ unsigned Opcode = Desc->Opcode;
+ unsigned FinalSize = 0;
+
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK) ++FinalSize;
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ case X86II::GS:
+ ++FinalSize;
+ break;
+ default: assert(0 && "Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) ++FinalSize;
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) ++FinalSize;
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) ++FinalSize;
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ ++FinalSize;
+ break; // Two-byte opcode prefix
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = X86InstrInfo::determineREX(MI);
+ if (REX)
+ ++FinalSize;
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ ++FinalSize;
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8: // 0F 38
+ ++FinalSize;
+ break;
+ case X86II::TA: // 0F 3A
+ ++FinalSize;
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ CurOp++;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ break;
+ case TargetInstrInfo::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ const TargetAsmInfo* AI = MF->getTarget().getTargetAsmInfo();
+ FinalSize += AI->getInlineAsmLength(AsmStr);
+ break;
+ }
+ case TargetInstrInfo::DBG_LABEL:
+ case TargetInstrInfo::EH_LABEL:
+ break;
+ case TargetInstrInfo::IMPLICIT_DEF:
+ case TargetInstrInfo::DECLARE:
+ case X86::DWARF_LOC:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ ++FinalSize;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm:
+ ++FinalSize;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ if (MO.isMBB()) {
+ FinalSize += sizePCRelativeBlockAddress();
+ } else if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (MO.isSymbol()) {
+ FinalSize += sizeExternalSymbolAddress(false);
+ } else if (MO.isImm()) {
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ } else {
+ assert(0 && "Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ ++FinalSize;
+ ++CurOp;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+ case X86II::MRMDestMem: {
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+
+ case X86II::MRMSrcMem: {
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86InstrInfo::sizeOfImm(Desc));
+ }
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ ++FinalSize;
+ if (Desc->getOpcode() == X86::LFENCE ||
+ Desc->getOpcode() == X86::MFENCE) {
+      // Special handling of lfence and mfence.
+ FinalSize += sizeRegModRMByte();
+ } else if (Desc->getOpcode() == X86::MONITOR ||
+ Desc->getOpcode() == X86::MWAIT) {
+ // Special handling of monitor and mwait.
+ FinalSize += sizeRegModRMByte() + 1; // +1 for the opcode.
+ } else {
+ ++CurOp;
+ FinalSize += sizeRegModRMByte();
+ }
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri32)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86InstrInfo::sizeOfImm(Desc);
+ if (MO.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64mi32)
+ dword = true;
+ if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ ++FinalSize;
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ FinalSize += sizeRegModRMByte();
+ ++CurOp;
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+ cerr << "Cannot determine size: ";
+ MI.dump();
+ cerr << '\n';
+ abort();
+ }
+
+
+ return FinalSize;
+}
+
+
+unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const TargetInstrDesc &Desc = MI->getDesc();
+ bool IsPIC = (TM.getRelocationModel() == Reloc::PIC_);
+ bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
+ unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
+ if (Desc.getOpcode() == X86::MOVPC32r) {
+ Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode);
+ }
+ return Size;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = DebugLoc::getUnknownLoc();
+ if (MBBI != FirstMBB.end()) DL = MBBI->getDebugLoc();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+  // The operand of MovePCtoStack is completely ignored by the asm printer.
+  // It's only used in JIT code emission as a displacement to the PC.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC)
+ .addImm(0);
+
+  // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+  // not to the PC, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+ GlobalBaseReg =
+ RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+ } else {
+ GlobalBaseReg = PC;
+ }
+
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..e09769e
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,461 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86TargetMachine;
+
+namespace X86 {
+ // X86 specific condition code. These correspond to X86_*_COND in
+ // X86InstrInfo.td. They must be kept in synch.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches to
+ // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs.
+ COND_NE_OR_P,
+ COND_NP_OR_E,
+
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction types. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information. In the Intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+    // destination are the same register.
+ MRMInitReg = 32,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction addresses 16 bit
+    // addresses instead of 32 bit addresses (or 32 bit addresses in 64 bit
+    // mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+    // used to obtain the setting of this field. If no bits in this field are
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0xF << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = 12,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = 13,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm16 = 2 << ImmShift,
+ Imm32 = 3 << ImmShift,
+ Imm64 = 4 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = 16,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = 19,
+ LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need the ability to address
+ // stuff in gs and fs segments.
+ SegOvrShift = 20,
+ SegOvrMask = 3 << SegOvrShift,
+ FS = 1 << SegOvrShift,
+ GS = 2 << SegOvrShift,
+
+ // Bits 22 -> 23 are unused
+ OpcodeShift = 24,
+ OpcodeMask = 0xFF << OpcodeShift
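+    // The raw one-byte opcode lives in these top eight bits; getBaseOpcodeFor
+    // in X86InstrInfo recovers it by shifting TSFlags right by OpcodeShift.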
+ };
+}
+
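+// A (non-LEA) X86 memory reference is described by five machine operands, in
+// this order: base register, scale, index register, displacement, and
+// segment; LEA addresses omit the segment (see isLeaMem/isMem below).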
+const int X86AddrNumOperands = 5;
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+4 <= MI->getNumOperands() &&
+ MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) &&
+ MI->getOperand(Op+2).isReg() &&
+ (MI->getOperand(Op+3).isImm() ||
+ MI->getOperand(Op+3).isGlobal() ||
+ MI->getOperand(Op+3).isCPI() ||
+ MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+5 <= MI->getNumOperands() &&
+ MI->getOperand(Op+4).isReg() &&
+ isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public TargetInstrInfoImpl {
+ X86TargetMachine &TM;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+ ///
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable2Addr;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable0;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable1;
+ DenseMap<unsigned*, unsigned> RegOp2MemOpTable2;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
+
+public:
+ explicit X86InstrInfo(X86TargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI) const;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, const MachineInstr *Orig) const;
+
+ bool isInvariantLoad(const MachineInstr *MI) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// commuteInstruction - We have a few instructions that must be hacked on to
+ /// commute them.
+ ///
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+ // Branch analysis.
+ virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+  /// specified operand(s). If this is possible, the target should perform the
+  /// folding and return the new instruction, otherwise it should return null.
+  /// If it folds the instruction, it is likely that the MachineInstr the
+  /// iterator references has been changed.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+  /// canFoldMemoryOperand - Returns true if the specified load / store
+  /// folding is possible.
+ virtual bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+  /// a store or a load and a store into two or more instructions. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would-be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible.
+ virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore) const;
+
+ virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ unsigned char getBaseOpcodeFor(const TargetInstrDesc *TID) const {
+ return TID->TSFlags >> X86II::OpcodeShift;
+ }
+ unsigned char getBaseOpcodeFor(unsigned Opcode) const {
+ return getBaseOpcodeFor(&get(Opcode));
+ }
+
+ static bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+
+ static unsigned sizeOfImm(const TargetInstrDesc *Desc);
+ static bool isX86_64ExtendedReg(const MachineOperand &MO);
+ static unsigned determineREX(const MachineInstr &MI);
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+private:
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..50ae417
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3961 @@
+//===- X86InstrInfo.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<1, 1,
+ [SDTCisInt<0>]>;
+def SDTBinaryArithWithFlags : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisInt<0>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SegmentBaseAddress : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTIntUnaryOp>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTIntUnaryOp>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86tailcall: SDNode<"X86ISD::TAILCALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+ [SDNPHasChain, SDNPOutFlag, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def X86SegmentBaseAddress : SDNode<"X86ISD::SegmentBaseAddress",
+ SDT_X86SegmentBaseAddress, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags>;
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
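+// The five sub-operands correspond, roughly, to the pieces of an x86 address:
+// base register, scale, index register, displacement, and segment.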
+
+def i8mem : X86MemOperand<"printi8mem">;
+def i16mem : X86MemOperand<"printi16mem">;
+def i32mem : X86MemOperand<"printi32mem">;
+def i64mem : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+def f32mem : X86MemOperand<"printf32mem">;
+def f64mem : X86MemOperand<"printf64mem">;
+def f80mem : X86MemOperand<"printf80mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX, i32imm, i8imm);
+}
+
+def lea32mem : Operand<i32> {
+ let PrintMethod = "printlea32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+}
+
+def piclabel: Operand<i32> {
+ let PrintMethod = "printPICLabel";
+}
+
+// A couple of more descriptive operand definitions.
+// 16 bits, but only 8 bits are significant.
+def i16i8imm : Operand<i16>;
+// 32 bits, but only 8 bits are significant.
+def i32i8imm : Operand<i32>;
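+// These are used by the *ri8 instruction forms below (e.g. AND16ri8) together
+// with the i16immSExt8/i32immSExt8 pattern leaves defined further down.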
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+ [add, sub, mul, shl, or, frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
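+// Instructions refer to these via Requires<[...]>; for example,
+// Requires<[In32BitMode]> below restricts a definition to 32-bit mode.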
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86-specific condition codes. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in sync.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+def i16immSExt8 : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
+  // sign-extended field.
+ return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i32immSExt8 : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+  // sign-extended field.
+ return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
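+// As a worked example: 127 and -128 pass the i32immSExt8 test (sign-extending
+// the low byte reproduces the value, so the short imm8 encoding can be used),
+// while 128 fails, since sign-extending its low byte yields -128.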
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return false;
+}]>;
+
+def nvloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ if (LD->isVolatile())
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4;
+ return false;
+}]>;
+
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 256;
+ return false;
+}]>;
+
+def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 257;
+ return false;
+}]>;
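+// Address spaces 256 and 257 are how this backend marks %gs- and %fs-relative
+// accesses, which is why the generic load fragments above reject any source
+// pointer whose address space is greater than 255.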
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr)), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
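+// Naming convention: the first width is the result type and the second is the
+// memory type, e.g. zextloadi32i8 is an i32 value zero-extended from an i8 load.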
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+// 'shld' and 'shrd' instruction patterns. Note that even though these have
+// the srl and shl in their patterns, the C++ code must still check for them,
+// because predicates are tested before child nodes are explored.
+
+def shrd : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (srl node:$src1, node:$amt1),
+ (shl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SRL &&
+ N->getOperand(1).getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+def shld : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+ (or (shl node:$src1, node:$amt1),
+ (srl node:$src2, node:$amt2)), [{
+ assert(N->getOpcode() == ISD::OR);
+ return N->getOperand(0).getOpcode() == ISD::SHL &&
+ N->getOperand(1).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+ N->getOperand(0).getConstantOperandVal(1) ==
+ N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
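+// In C terms these match the constant double-shift idiom, roughly
+//   (a >> n) | (b << (W - n))   for shrd, and
+//   (a << (W - n)) | (b >> n)   for shld,
+// where W is the operand width and both shift amounts are constants.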
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
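+// For example, a call that passes 16 bytes of arguments on the stack is
+// roughly bracketed as ADJCALLSTACKDOWN32 16, then the argument stores and
+// CALLpcrel32, then ADJCALLSTACKUP32 16, 0.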
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In32BitMode]>;
+}
+
+// Nop
+let neverHasSideEffects = 1 in
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+
+// PIC base
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins piclabel:$label),
+ "call\t$label\n\t"
+ "pop{l}\t$reg", []>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in {
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret",
+ [(X86retflag 0)]>;
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret\t$amt",
+ [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1 in
+ class IBr<bits<8> opcode, dag ins, string asm, list<dag> pattern> :
+ I<opcode, RawFrm, (outs), ins, asm, pattern>;
+
+let isBranch = 1, isBarrier = 1 in
+ def JMP : IBr<0xE9, (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)]>;
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))]>;
+}
+
+// Conditional branches
+let Uses = [EFLAGS] in {
+def JE : IBr<0x84, (ins brtarget:$dst), "je\t$dst",
+ [(X86brcond bb:$dst, X86_COND_E, EFLAGS)]>, TB;
+def JNE : IBr<0x85, (ins brtarget:$dst), "jne\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NE, EFLAGS)]>, TB;
+def JL : IBr<0x8C, (ins brtarget:$dst), "jl\t$dst",
+ [(X86brcond bb:$dst, X86_COND_L, EFLAGS)]>, TB;
+def JLE : IBr<0x8E, (ins brtarget:$dst), "jle\t$dst",
+ [(X86brcond bb:$dst, X86_COND_LE, EFLAGS)]>, TB;
+def JG : IBr<0x8F, (ins brtarget:$dst), "jg\t$dst",
+ [(X86brcond bb:$dst, X86_COND_G, EFLAGS)]>, TB;
+def JGE : IBr<0x8D, (ins brtarget:$dst), "jge\t$dst",
+ [(X86brcond bb:$dst, X86_COND_GE, EFLAGS)]>, TB;
+
+def JB : IBr<0x82, (ins brtarget:$dst), "jb\t$dst",
+ [(X86brcond bb:$dst, X86_COND_B, EFLAGS)]>, TB;
+def JBE : IBr<0x86, (ins brtarget:$dst), "jbe\t$dst",
+ [(X86brcond bb:$dst, X86_COND_BE, EFLAGS)]>, TB;
+def JA : IBr<0x87, (ins brtarget:$dst), "ja\t$dst",
+ [(X86brcond bb:$dst, X86_COND_A, EFLAGS)]>, TB;
+def JAE : IBr<0x83, (ins brtarget:$dst), "jae\t$dst",
+ [(X86brcond bb:$dst, X86_COND_AE, EFLAGS)]>, TB;
+
+def JS : IBr<0x88, (ins brtarget:$dst), "js\t$dst",
+ [(X86brcond bb:$dst, X86_COND_S, EFLAGS)]>, TB;
+def JNS : IBr<0x89, (ins brtarget:$dst), "jns\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NS, EFLAGS)]>, TB;
+def JP : IBr<0x8A, (ins brtarget:$dst), "jp\t$dst",
+ [(X86brcond bb:$dst, X86_COND_P, EFLAGS)]>, TB;
+def JNP : IBr<0x8B, (ins brtarget:$dst), "jnp\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NP, EFLAGS)]>, TB;
+def JO : IBr<0x80, (ins brtarget:$dst), "jo\t$dst",
+ [(X86brcond bb:$dst, X86_COND_O, EFLAGS)]>, TB;
+def JNO : IBr<0x81, (ins brtarget:$dst), "jno\t$dst",
+ [(X86brcond bb:$dst, X86_COND_NO, EFLAGS)]>, TB;
+} // Uses = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+ def CALLpcrel32 : Ii32<0xE8, RawFrm, (outs), (ins i32imm:$dst,variable_ops),
+ "call\t${dst:call}", []>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR32:$dst)]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
+ }
+
+// Tail call stuff.
+
+def TAILCALL : I<0, Pseudo, (outs), (ins),
+ "#TAILCALL",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi : I<0, Pseudo, (outs), (ins i32imm:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri : I<0, Pseudo, (outs), (ins GR32:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call} # TAILCALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst # TAILCALL",
+ []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem:$dst),
+ "jmp\t{*}$dst # TAILCALL", []>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, neverHasSideEffects=1 in
+def POPFD : I<0x9D, RawFrm, (outs), (ins), "popf", []>;
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHFD : I<0x9C, RawFrm, (outs), (ins), "pushf", []>;
+
+let isTwoAddress = 1 in // GR32 = bswap GR32
+ def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsf (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsf (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (X86bsr (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (X86bsr (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+let neverHasSideEffects = 1 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins i32mem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI] in {
+def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)]>, REP;
+}
+
+let Defs = [ECX,EDI], Uses = [AL,ECX,EDI] in
+def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)]>, REP;
+let Defs = [ECX,EDI], Uses = [AX,ECX,EDI] in
+def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)]>, REP, OpSize;
+let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI] in
+def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)]>, REP;
+
+let Defs = [RAX, RDX] in
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>,
+ TB;
+
+let isBarrier = 1, hasCtrlDep = 1 in {
+def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions...
+//
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|%AL, %DX}", []>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|%AX, %DX}", []>, OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{b}\t{$port, %al|%AL, $port}", []>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{l}\t{$port, %eax|%EAX, $port}", []>;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|%DX, %AL}", []>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{b}\t{%al, $port|$port, %AL}", []>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{l}\t{%eax, $port|$port, %EAX}", []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)]>;
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))]>;
+}
+
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)]>;
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
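+// (With a REX prefix present, the encodings that normally select AH, CH, DH,
+// and BH instead refer to SPL, BPL, SIL, and DIL, so the h registers are only
+// reachable through REX-free instructions such as these.)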
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayLoad = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)]>; // AL,AH = AL*GR8
+
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*GR16
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*GR32
+
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)]>; // AL,AH = AL*[mem8]
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*[mem16]
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*[mem32]
+}
+
+let neverHasSideEffects = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>;
+ // AL,AH = AL*GR8
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize; // AX,DX = AX*GR16
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>;
+ // EAX,EDX = EAX*GR32
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", []>; // AL,AH = AL*[mem8]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32]
+}
+} // neverHasSideEffects
+
+// Unsigned division/remainder.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "div{l}\t$src", []>;
+}
+
+// Signed division/remainder.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "idiv{l}\t$src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions.
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in {
+def CMOVB16rr : I<0x42, MRMSrcReg, // if <u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg, // if <u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rr: I<0x43, MRMSrcReg, // if >=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg, // if >=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rr : I<0x44, MRMSrcReg, // if ==, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg, // if ==, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rr: I<0x45, MRMSrcReg, // if !=, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg, // if !=, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rr: I<0x46, MRMSrcReg, // if <=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg, // if <=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rr : I<0x47, MRMSrcReg, // if >u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg, // if >u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rr : I<0x4C, MRMSrcReg, // if <s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg, // if <s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rr: I<0x4D, MRMSrcReg, // if >=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg, // if >=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rr: I<0x4E, MRMSrcReg, // if <=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg, // if <=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rr : I<0x4F, MRMSrcReg, // if >s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg, // if >s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rr : I<0x48, MRMSrcReg, // if signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg, // if signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rr: I<0x49, MRMSrcReg, // if !signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg, // if !signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rr : I<0x4A, MRMSrcReg, // if parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg, // if parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rr : I<0x4B, MRMSrcReg, // if !parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg, // if !parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rr : I<0x40, MRMSrcReg, // if overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rr : I<0x40, MRMSrcReg, // if overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rr : I<0x41, MRMSrcReg, // if !overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rr : I<0x41, MRMSrcReg, // if !overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // isCommutable = 1
+
+def CMOVB16rm : I<0x42, MRMSrcMem, // if <u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rm : I<0x42, MRMSrcMem, // if <u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovb\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rm: I<0x43, MRMSrcMem, // if >=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rm: I<0x43, MRMSrcMem, // if >=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovae\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rm : I<0x44, MRMSrcMem, // if ==, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rm : I<0x44, MRMSrcMem, // if ==, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmove\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rm: I<0x45, MRMSrcMem, // if !=, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rm: I<0x45, MRMSrcMem, // if !=, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovne\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rm: I<0x46, MRMSrcMem, // if <=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rm: I<0x46, MRMSrcMem, // if <=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovbe\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rm : I<0x47, MRMSrcMem, // if >u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rm : I<0x47, MRMSrcMem, // if >u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmova\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rm : I<0x4C, MRMSrcMem, // if <s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rm : I<0x4C, MRMSrcMem, // if <s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovl\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rm: I<0x4D, MRMSrcMem, // if >=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rm: I<0x4D, MRMSrcMem, // if >=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovge\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rm: I<0x4E, MRMSrcMem, // if <=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rm: I<0x4E, MRMSrcMem, // if <=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovle\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rm : I<0x4F, MRMSrcMem, // if >s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rm : I<0x4F, MRMSrcMem, // if >s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovg\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rm : I<0x48, MRMSrcMem, // if signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rm : I<0x48, MRMSrcMem, // if signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovs\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rm: I<0x49, MRMSrcMem, // if !signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rm: I<0x49, MRMSrcMem, // if !signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovns\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rm : I<0x4A, MRMSrcMem, // if parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rm : I<0x4A, MRMSrcMem, // if parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rm : I<0x4B, MRMSrcMem, // if !parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovnp\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rm : I<0x40, MRMSrcMem, // if overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rm : I<0x40, MRMSrcMem, // if overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovo\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rm : I<0x41, MRMSrcMem, // if !overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rm : I<0x41, MRMSrcMem, // if !overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovno\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // Uses = [EFLAGS]
+
+
+// Unary instructions.
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src), "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src)),
+ (implicit EFLAGS)]>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src), "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src)),
+ (implicit EFLAGS)]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src), "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src)),
+ (implicit EFLAGS)]>;
+let isTwoAddress = 0 in {
+ def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+ def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+}
+} // Defs = [EFLAGS]
+
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src))]>;
+}
+let isTwoAddress = 0 in {
+ def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+ def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst",
+ [(set GR8:$dst, (add GR8:$src, 1)),
+ (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, 1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, 1)),
+ (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst",
+ [(set GR8:$dst, (add GR8:$src, -1)),
+ (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst",
+ [(set GR16:$dst, (add GR16:$src, -1)),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst",
+ [(set GR32:$dst, (add GR32:$src, -1)),
+ (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+} // Defs = [EFLAGS]
+
+// Logical operators...
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y
+def AND8rr : I<0x20, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def AND16rr : I<0x21, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def AND32rr : I<0x21, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+}
+
+def AND8rm : I<0x22, MRMSrcMem,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))),
+ (implicit EFLAGS)]>;
+def AND16rm : I<0x23, MRMSrcMem,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def AND32rm : I<0x23, MRMSrcMem,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def AND8ri : Ii8<0x80, MRM4r,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def AND16ri : Ii16<0x81, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def AND32ri : Ii32<0x81, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def AND16ri8 : Ii8<0x83, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>,
+ OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ def AND8mr : I<0x20, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mr : I<0x21, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mr : I<0x21, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND8mi : Ii8<0x80, MRM4m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi : Ii16<0x81, MRM4m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi : Ii32<0x81, MRM4m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+
+let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y
+def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+}
+def OR8rm : I<0x0A, MRMSrcMem , (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def OR16rm : I<0x0B, MRMSrcMem , (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def OR32rm : I<0x0B, MRMSrcMem , (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+
+def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+let isTwoAddress = 0 in {
+ def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mr : I<0x09, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def OR32mr : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR8mi : Ii8<0x80, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi : Ii16<0x81, MRM1m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi : Ii32<0x81, MRM1m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi8 : Ii8<0x83, MRM1m, (outs), (ins i16mem:$dst, i16i8imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi8 : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // isTwoAddress = 0
+
+
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+ def XOR8rr : I<0x30, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+ def XOR16rr : I<0x31, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+ def XOR32rr : I<0x31, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+} // isCommutable = 1
+
+def XOR8rm : I<0x32, MRMSrcMem ,
+ (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def XOR16rm : I<0x33, MRMSrcMem ,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>,
+ OpSize;
+def XOR32rm : I<0x33, MRMSrcMem ,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+def XOR8ri : Ii8<0x80, MRM6r,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def XOR16ri : Ii16<0x81, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def XOR32ri : Ii32<0x81, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def XOR16ri8 : Ii8<0x83, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>,
+ OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ def XOR8mr : I<0x30, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mr : I<0x31, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mr : I<0x31, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR8mi : Ii8<0x80, MRM6m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi : Ii16<0x81, MRM6m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi : Ii32<0x81, MRM6m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+} // isTwoAddress = 0
+} // Defs = [EFLAGS]
+
+// Shift instructions
+let Defs = [EFLAGS] in {
+let Uses = [CL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shl{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (shl GR8:$src, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src),
+ "shl{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (shl GR16:$src, CL))]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
+ "shl{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (shl GR32:$src, CL))]>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't use shifts of a register by one, because 'add reg,reg' is
+// cheaper.
+} // isConvertibleToThreeAddress = 1
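+// For illustration: selecting 'add' for a one-bit left shift is at least as
+// compact, e.g. 'addl %eax, %eax' encodes as 01 C0 (2 bytes), the same size as
+// 'shll $1, %eax' (D1 E0), and the add can usually issue on more ALU ports.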
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shr{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (srl GR8:$src, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src),
+ "shr{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (srl GR16:$src, CL))]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src),
+ "shr{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (srl GR32:$src, CL))]>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize;
+ def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+ def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src),
+ "sar{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (sra GR8:$src, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src),
+ "sar{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (sra GR16:$src, CL))]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src),
+ "sar{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (sra GR32:$src, CL))]>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
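+// (For reference, 'rol $1, %eax' has a 3-byte imm8 encoding, C1 C0 01, and a
+// 2-byte by-one encoding, D1 C0; the *r1/*m1 defs below use the latter opcodes.)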
+let Uses = [CL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
+ "rol{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotl GR8:$src, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+ "rol{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotl GR16:$src, CL))]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+ "rol{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotl GR32:$src, CL))]>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "rol{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "rol{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "rol{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src),
+ "ror{b}\t{%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotr GR8:$src, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+ "ror{w}\t{%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotr GR16:$src, CL))]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "ror{l}\t{%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotr GR32:$src, CL))]>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
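+// Roughly, for a count CL in 1..31, 'shldl %cl, %ebx, %eax' computes
+//   EAX = (EAX << CL) | (EBX >> (32 - CL))
+// i.e. the vacated low bits are filled from the top of the second source;
+// shrd is the mirror image, and with both sources equal this is just a rotate.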
+let Uses = [CL] in {
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+}
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ }
+ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+ def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+
+ let Uses = [CL] in {
+ def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ }
+ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+ def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+}
+} // Defs = [EFLAGS]
+
+
+// Arithmetic.
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y
+// Register-Register Addition
+def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst),
+ (ins GR8 :$src1, GR8 :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Register Addition
+def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
+
+// Register-Memory Addition
+def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Addition
+def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Integer Addition
+def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+}
+
+let isTwoAddress = 0 in {
+ // Memory-Register Addition
+ def ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADC Y, Z --> X = ADC Z, Y
+def ADC8rr : I<0x10, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, GR8:$src2))]>;
+def ADC16rr : I<0x11, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, GR16:$src2))]>, OpSize;
+def ADC32rr : I<0x11, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+def ADC8rm : I<0x12, MRMSrcMem , (outs GR8:$dst),
+ (ins GR8:$src1, i8mem:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2)))]>;
+def ADC16rm : I<0x13, MRMSrcMem , (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def ADC32rm : I<0x13, MRMSrcMem , (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC8ri : Ii8<0x80, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, imm:$src2))]>;
+def ADC16ri : Ii16<0x81, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, imm:$src2))]>, OpSize;
+def ADC16ri8 : Ii8<0x83, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def ADC32ri : Ii32<0x81, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def ADC8mr : I<0x10, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def ADC16mr : I<0x11, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mr : I<0x11, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def ADC8mi : Ii8<0x80, MRM2m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC16mi : Ii16<0x81, MRM2m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC16mi8 : Ii8<0x83, MRM2m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mi : Ii32<0x81, MRM2m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC32mi8 : Ii8<0x83, MRM2m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+} // Uses = [EFLAGS]
+
+// Register-Register Subtraction
+def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>;
+
+// Register-Memory Subtraction
+def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst),
+ (ins GR8:$src1, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 0 in {
+ // Memory-Register Subtraction
+ def SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ // Memory-Integer Subtraction
+ def SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi16 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi32 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+}
+
+let Uses = [EFLAGS] in {
+def SBB8rr : I<0x18, MRMDestReg, (outs GR8:$dst),
+ (ins GR8:$src1, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, GR8:$src2))]>;
+def SBB16rr : I<0x19, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, GR16:$src2))]>, OpSize;
+def SBB32rr : I<0x19, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def SBB8mr : I<0x18, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def SBB16mr : I<0x19, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mr : I<0x19, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def SBB8mi : Ii32<0x80, MRM3m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB16mi : Ii16<0x81, MRM3m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB16mi8 : Ii8<0x83, MRM3m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mi : Ii32<0x81, MRM3m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB32mi8 : Ii8<0x83, MRM3m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB8rm : I<0x1A, MRMSrcMem, (outs GR8:$dst), (ins GR8:$src1, i8mem:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2)))]>;
+def SBB16rm : I<0x1B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def SBB32rm : I<0x1B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB8ri : Ii8<0x80, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, imm:$src2))]>;
+def SBB16ri : Ii16<0x81, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, imm:$src2))]>, OpSize;
+def SBB16ri8 : Ii8<0x83, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def SBB32ri : Ii32<0x81, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)),
+ (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)),
+ (implicit EFLAGS)]>, TB;
+}
+
+// Register-Memory Signed Integer Multiply
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two address instructions!
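+// e.g. 'imull $1000, %esi, %edi' reads ESI and writes the (possibly different)
+// register EDI, so $dst is not tied to $src1 the way it is in the rr/rm forms.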
+let Defs = [EFLAGS] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, imm:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiply
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)),
+ (implicit EFLAGS)]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1),
+ i16immSExt8:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1),
+ i32immSExt8:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
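+// e.g. 'testl %eax, %eax' sets EFLAGS exactly as 'andl %eax, %eax' would, but
+// discards the result; hence the patterns below only produce an implicit
+// EFLAGS def and compare the and'ed value against 0.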
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // TEST X, Y --> TEST Y, X
+def TEST8rr : I<0x84, MRMDestReg, (outs), (ins GR8:$src1, GR8:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR8:$src1, GR8:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16rr : I<0x85, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR16:$src1, GR16:$src2), 0),
+ (implicit EFLAGS)]>,
+ OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR32:$src1, GR32:$src2), 0),
+ (implicit EFLAGS)]>;
+}
+
+def TEST8rm : I<0x84, MRMSrcMem, (outs), (ins GR8 :$src1, i8mem :$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0),
+ (implicit EFLAGS)]>;
+
+def TEST8ri : Ii8 <0xF6, MRM0r, // flags = GR8 & imm8
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR8:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16ri : Ii16<0xF7, MRM0r, // flags = GR16 & imm16
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR16:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r, // flags = GR32 & imm32
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and_su GR32:$src1, imm:$src2), 0),
+ (implicit EFLAGS)]>;
+
+def TEST8mi : Ii8 <0xF6, MRM0m, // flags = [mem8] & imm8
+ (outs), (ins i8mem:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>;
+def TEST16mi : Ii16<0xF7, MRM0m, // flags = [mem16] & imm16
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>, OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m, // flags = [mem32] & imm32
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
+
+let Uses = [EFLAGS] in {
+def SETEr : I<0x94, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sete\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_E, EFLAGS))]>,
+ TB; // GR8 = ==
+def SETEm : I<0x94, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sete\t$dst",
+ [(store (X86setcc X86_COND_E, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = ==
+
+def SETNEr : I<0x95, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setne\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NE, EFLAGS))]>,
+ TB; // GR8 = !=
+def SETNEm : I<0x95, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setne\t$dst",
+ [(store (X86setcc X86_COND_NE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !=
+
+def SETLr : I<0x9C, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setl\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_L, EFLAGS))]>,
+ TB; // GR8 = < signed
+def SETLm : I<0x9C, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setl\t$dst",
+ [(store (X86setcc X86_COND_L, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = < signed
+
+def SETGEr : I<0x9D, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setge\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_GE, EFLAGS))]>,
+ TB; // GR8 = >= signed
+def SETGEm : I<0x9D, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setge\t$dst",
+ [(store (X86setcc X86_COND_GE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = >= signed
+
+def SETLEr : I<0x9E, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setle\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_LE, EFLAGS))]>,
+ TB; // GR8 = <= signed
+def SETLEm : I<0x9E, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setle\t$dst",
+ [(store (X86setcc X86_COND_LE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <= signed
+
+def SETGr : I<0x9F, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setg\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_G, EFLAGS))]>,
+ TB; // GR8 = > signed
+def SETGm : I<0x9F, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setg\t$dst",
+ [(store (X86setcc X86_COND_G, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = > signed
+
+def SETBr : I<0x92, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setb\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_B, EFLAGS))]>,
+ TB; // GR8 = < unsign
+def SETBm : I<0x92, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setb\t$dst",
+ [(store (X86setcc X86_COND_B, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = < unsign
+
+def SETAEr : I<0x93, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setae\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_AE, EFLAGS))]>,
+ TB; // GR8 = >= unsign
+def SETAEm : I<0x93, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setae\t$dst",
+ [(store (X86setcc X86_COND_AE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = >= unsign
+
+def SETBEr : I<0x96, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setbe\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_BE, EFLAGS))]>,
+ TB; // GR8 = <= unsign
+def SETBEm : I<0x96, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setbe\t$dst",
+ [(store (X86setcc X86_COND_BE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <= unsign
+
+def SETAr : I<0x97, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seta\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_A, EFLAGS))]>,
+                 TB; // GR8 = > unsigned
+def SETAm : I<0x97, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seta\t$dst",
+ [(store (X86setcc X86_COND_A, EFLAGS), addr:$dst)]>,
+                 TB; // [mem8] = > unsigned
+
+def SETSr : I<0x98, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sets\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_S, EFLAGS))]>,
+ TB; // GR8 = <sign bit>
+def SETSm : I<0x98, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sets\t$dst",
+ [(store (X86setcc X86_COND_S, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <sign bit>
+def SETNSr : I<0x99, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setns\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NS, EFLAGS))]>,
+ TB; // GR8 = !<sign bit>
+def SETNSm : I<0x99, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setns\t$dst",
+ [(store (X86setcc X86_COND_NS, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !<sign bit>
+
+def SETPr : I<0x9A, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_P, EFLAGS))]>,
+ TB; // GR8 = parity
+def SETPm : I<0x9A, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setp\t$dst",
+ [(store (X86setcc X86_COND_P, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = parity
+def SETNPr : I<0x9B, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setnp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NP, EFLAGS))]>,
+ TB; // GR8 = not parity
+def SETNPm : I<0x9B, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setnp\t$dst",
+ [(store (X86setcc X86_COND_NP, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not parity
+
+def SETOr : I<0x90, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seto\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_O, EFLAGS))]>,
+ TB; // GR8 = overflow
+def SETOm : I<0x90, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seto\t$dst",
+ [(store (X86setcc X86_COND_O, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = overflow
+def SETNOr : I<0x91, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setno\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NO, EFLAGS))]>,
+ TB; // GR8 = not overflow
+def SETNOm : I<0x91, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setno\t$dst",
+ [(store (X86setcc X86_COND_NO, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not overflow
+} // Uses = [EFLAGS]
+
+
+// Integer comparisons
+let Defs = [EFLAGS] in {
+def CMP8rr : I<0x38, MRMDestReg,
+ (outs), (ins GR8 :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, GR8:$src2), (implicit EFLAGS)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+ (outs), (ins GR16:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, GR16:$src2), (implicit EFLAGS)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+ (outs), (ins GR32:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, GR32:$src2), (implicit EFLAGS)]>;
+def CMP8mr : I<0x38, MRMDestMem,
+ (outs), (ins i8mem :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), GR8:$src2),
+ (implicit EFLAGS)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+ (outs), (ins i16mem:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), GR16:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+ (outs), (ins i32mem:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), GR32:$src2),
+ (implicit EFLAGS)]>;
+def CMP8rm : I<0x3A, MRMSrcMem,
+ (outs), (ins GR8 :$src1, i8mem :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR16:$src1, i16mem:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR32:$src1, i32mem:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)]>;
+def CMP8ri : Ii8<0x80, MRM7r,
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, imm:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP8mi : Ii8 <0x80, MRM7m,
+ (outs), (ins i8mem :$src1, i8imm :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), imm:$src2),
+ (implicit EFLAGS)]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Disable these instructions for now.
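+// (Concretely: 'bt %eax, (%rdi)' treats (%rdi) as the base of a bit string and
+// may access memory outside the i32 operand, while 'bt %ecx, %eax' only uses
+// the low 5 bits of ECX.)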
+//def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+// "bt{w}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi16 addr:$src1), GR16:$src2),
+// (implicit EFLAGS)]>, OpSize, TB, Requires<[FastBTMem]>;
+//def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+// "bt{l}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi32 addr:$src1), GR32:$src2),
+// (implicit EFLAGS)]>, TB, Requires<[FastBTMem]>;
+
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)]>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(X86bt (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Sign/Zero extenders
+// Use movsbl instead of movsbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.
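+// e.g. 'movsbw %al, %cx' (66 0F BE C8) writes only CX, a partial update of
+// ECX, while 'movsbl %al, %ecx' (0F BE C8) is one byte shorter and overwrites
+// all of ECX, breaking any dependence on the register's old value.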
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (sext GR8:$src))]>, TB;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+// Use movzbl instead of movzbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (zext GR8:$src))]>, TB;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
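+// An instruction carrying a REX prefix cannot encode AH/BH/CH/DH, so keeping
+// the destination out of the REX-only registers ensures that, e.g.,
+// "movzbl %ah, %ecx" remains encodable in 64-bit mode.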
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+
+let neverHasSideEffects = 1 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", []>, OpSize; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", []>; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins),
+ "xor{b}\t$dst, $dst",
+ [(set GR8:$dst, 0)]>;
+// Use xorl instead of xorw since we don't care about the high 16 bits,
+// it's smaller, and it avoids a partial-register update.
+def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+ "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
+ [(set GR16:$dst, 0)]>;
+def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins),
+ "xor{l}\t$dst, $dst",
+ [(set GR32:$dst, 0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP, EBX] in
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32imm:$sym),
+ "leal\t${sym:mem}(,%ebx,1), %eax; "
+ "call\t___tls_get_addr@PLT",
+ [(X86tlsaddr tglobaltlsaddr:$sym)]>,
+ Requires<[In32BitMode]>;
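+// (This is the usual ELF general-dynamic TLS sequence: %ebx holds the GOT
+// pointer, the leal materializes the argument to ___tls_get_addr, and the
+// variable's address comes back in %eax, matching the Defs/Uses above.)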
+
+let AddedComplexity = 5 in
+def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%gs:$src, $dst",
+ [(set GR32:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5 in
+def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%fs:$src, $dst",
+ [(set GR32:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC : I<0, Pseudo, (outs),
+ (ins i32imm:$line, i32imm:$col, i32imm:$file),
+ ".loc\t${file:debug} ${line:debug} ${col:debug}",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+ (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions; when a memory operand
+// is referenced, the processor asserts LOCK implicitly, so the swap is atomic.
+let Constraints = "$val = $dst" in {
+def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "xchg{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "xchg{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>,
+ OpSize;
+def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
+ "xchg{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+}
+
+// Atomic compare and swap.
+let Defs = [EAX, EFLAGS], Uses = [EAX] in {
+def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
+ "lock\n\t"
+ "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK;
+}
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i32mem:$ptr),
+ "lock\n\t"
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
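+// (cmpxchg8b compares EDX:EAX against the 64-bit memory operand; on a match it
+// stores ECX:EBX, otherwise it loads the memory value into EDX:EAX, which is
+// why all four registers appear in the Defs/Uses above.)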
+
+let Defs = [AX, EFLAGS], Uses = [AX] in {
+def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
+ "lock\n\t"
+ "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK;
+}
+let Defs = [AL, EFLAGS], Uses = [AL] in {
+def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
+ "lock\n\t"
+ "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+}
+
+// Atomic exchange and add
+let Constraints = "$val = $dst", Defs = [EFLAGS] in {
+def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "lock\n\t"
+ "xadd{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>,
+ TB, LOCK;
+def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "lock\n\t"
+ "xadd{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>,
+ TB, OpSize, LOCK;
+def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
+ "lock\n\t"
+ "xadd{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>,
+ TB, LOCK;
+}
+
+// Atomic exchange, and, or, xor
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
+def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
+def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMXOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
+def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMNAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
+def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "#ATOMMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
+def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
+
+def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
+def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
+def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMXOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
+def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMNAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
+def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "#ATOMMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
+def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
+
+def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
+def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
+def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMXOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
+def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMNAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
+}
+
+let Constraints = "$val1 = $dst1, $val2 = $dst2",
+ Defs = [EFLAGS, EAX, EBX, ECX, EDX],
+ Uses = [EAX, EBX, ECX, EDX],
+ mayLoad = 1, mayStore = 1,
+ usesCustomDAGSchedInserter = 1 in {
+def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMAND6432 PSEUDO!", []>;
+def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMOR6432 PSEUDO!", []>;
+def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMXOR6432 PSEUDO!", []>;
+def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMNAND6432 PSEUDO!", []>;
+def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMADD6432 PSEUDO!", []>;
+def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSUB6432 PSEUDO!", []>;
+def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSWAP6432 PSEUDO!", []>;
+}
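+// (These pseudos are expanded by the custom inserter, presumably into a
+// cmpxchg8b retry loop, since no single instruction performs a 64-bit atomic
+// read-modify-write in 32-bit mode.)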
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+
+// Calls
+// tailcall stuff
+def : Pat<(X86tailcall GR32:$dst),
+ (TAILCALL)>;
+
+def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i32 texternalsym:$dst)),
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR32:$dst, imm:$off),
+ (TCRETURNri GR32:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi tglobaladdr:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>;
+
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR32:$src1, GR32:$src2),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(parallel (X86cmp GR8:$src1, 0), (implicit EFLAGS)),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(parallel (X86cmp GR16:$src1, 0), (implicit EFLAGS)),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(parallel (X86cmp GR32:$src1, 0), (implicit EFLAGS)),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
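+// For example, (X86cmov (load m), r2, COND_B) yields r2 when CF is set and the
+// loaded value otherwise; CMOVAE r2, m computes exactly that with the load
+// folded, hence the swapped operands and flipped condition below.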
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO32rm GR32:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, x86_subreg_16bit)>;
+
+// (and (i32 load), 255) -> (zextload i8)
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))),
+ (MOVZX32rm8 addr:$src)>;
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 65535))),
+ (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
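+// For example, "addl $128, %eax" needs a full 32-bit immediate, while
+// "subl $-128, %eax" fits the one-byte sign-extended immediate form.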
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src1, GR32_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit))>,
+ Requires<[In32BitMode]>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit)>,
+ Requires<[In32BitMode]>;
+
+// h-register tricks
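+// Bits 15:8 of AX/BX/CX/DX are directly addressable as AH/BH/CH/DH, so once a
+// value is constrained to the ABCD register classes, (x >> 8) masked to a byte
+// becomes a plain read of the high-byte subregister.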
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl_su GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi)),
+ x86_subreg_16bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR16:$src, GR16_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_REGCLASS GR32:$src, GR32_ABCD),
+ x86_subreg_8bit_hi))>,
+ Requires<[In32BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
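+// The hardware already masks the shift count to 5 bits for 8/16/32-bit shifts,
+// so the explicit "and" with 31 is redundant and can be dropped.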
+def : Pat<(shl GR8:$src1, (and CL:$amt, 31)),
+ (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL:$amt, 31)),
+ (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL:$amt, 31)),
+ (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHL32mCL addr:$dst)>;
+
+def : Pat<(srl GR8:$src1, (and CL:$amt, 31)),
+ (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL:$amt, 31)),
+ (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL:$amt, 31)),
+ (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SHR32mCL addr:$dst)>;
+
+def : Pat<(sra GR8:$src1, (and CL:$amt, 31)),
+ (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL:$amt, 31)),
+ (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL:$amt, 31)),
+ (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+ (SAR32mCL addr:$dst)>;
+
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
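+// shrd shifts its first operand right by c and fills the vacated high bits
+// from the second operand, so this funnel-shift idiom becomes one instruction.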
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (srl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (shl GR32:$src1, (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+ (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+ addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+ (SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1),
+ GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (srl GR16:$src1, (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (shl GR16:$src1, (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+ (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+ addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+ (SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
+ GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (ADD8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (ADD32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, GR8:$src2),
+ (implicit EFLAGS)),
+ (SUB8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), GR8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), GR16:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), GR32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), imm:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)),
+ (SUB32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+
+// Register-Register Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)),
+ (implicit EFLAGS)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2),
+ (implicit EFLAGS)),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Optimize multiply by 2 with EFLAGS result.
+let AddedComplexity = 2 in {
+def : Pat<(parallel (X86smul_flag GR16:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD16rr GR16:$src1, GR16:$src1)>;
+
+def : Pat<(parallel (X86smul_flag GR32:$src1, 2),
+ (implicit EFLAGS)),
+ (ADD32rr GR32:$src1, GR32:$src1)>;
+}
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
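+// The 16- and 32-bit forms below are restricted to 32-bit mode because the
+// short 0x40-0x4F inc/dec encodings they map to are repurposed as REX prefixes
+// in 64-bit mode; 64-bit mode uses separately defined variants.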
+def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)),
+ (INC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86inc_flag (loadi8 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC8m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)),
+ (DEC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86dec_flag (loadi8 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC8m addr:$dst)>;
+
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+ (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC16m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+ (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC16m addr:$dst)>, Requires<[In32BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+ (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (INC32m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+ (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+ (implicit EFLAGS)),
+ (DEC32m addr:$dst)>, Requires<[In32BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86Instr64bit.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..8f287e1
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,694 @@
+//====- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8 : PatFrag<(ops node:$in), (v8i8 (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def mmx_unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 1, 5, ...>
+def mmx_unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def mmx_unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def mmx_unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], MMX_SHUFFLE_get_shuf_imm>;
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+ // MMXI_binop_rm - Simple MMX binary operator.
+ multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+ (bitconvert
+ (load_mmx addr:$src2)))))]>;
+ }
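+  // A single defm such as MMX_PADDB below expands this into matching
+  // register-register and register-memory forms (MMX_PADDBrr and MMX_PADDBrm)
+  // that share one operator and one pattern shape.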
+
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ }
+
+ // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+ //
+ // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+ // to collapse (bitconvert VT to VT) into its operand.
+ //
+ multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (v2i32 (scalar_to_vector GR32:$src)))]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMSrcReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMDestMem, (outs VR64:$dst), (ins VR128:$src),
+ "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))]>;
+
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMDestMem, (outs VR128:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (movl immAllZerosV,
+ (v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src))))))]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMDestMem, (outs FR64:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32
+ (scalar_to_vector (loadi32 addr:$src))))))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+ def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ VR64:$src2)))]>;
+ def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+
+// Shift up / down and insert zeros.
+def : Pat<(v1i64 (X86vshl VR64:$src, (i8 imm:$amt))),
+ (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr VR64:$src, (i8 imm:$amt))),
+ (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let isTwoAddress = 1 in {
+ // Unpack High Packed Data Instructions
+ def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+
+ // Unpack Low Packed Data Instructions
+ def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_pshufw:$src2 VR64:$src1, (undef))))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (mmx_pshufw:$src2 (bc_v4i16 (load_mmx addr:$src1)),
+ (undef)))]>;
+
+// -- Conversion Instructions
+let neverHasSideEffects = 1 in {
+def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+} // end neverHasSideEffects
+
+
+// Extract / Insert
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+ (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+ def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, GR32:$src2, i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ GR32:$src2, (iPTR imm:$src3))))]>;
+ def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMDestMem, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+let isReMaterializable = 1 in {
+ def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins),
+ "pxor\t$dst, $dst",
+ [(set VR64:$dst, (v2i32 immAllZerosV))]>;
+ def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins),
+ "pcmpeqd\t$dst, $dst",
+ [(set VR64:$dst, (v2i32 immAllOnesV))]>;
+}
+
+let Predicates = [HasMMX] in {
+ def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2f32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// Bit convert.
+def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+
+// Move scalar to MMX, zero-extended: a movd to an MMX register zero-extends
+// the upper bits.
+let AddedComplexity = 15 in {
+ def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+}
+
+let AddedComplexity = 20 in {
+ def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+}
+
+// Clear top half.
+let AddedComplexity = 15 in {
+ def : Pat<(v8i8 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+ def : Pat<(v4i16 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+ def : Pat<(v2i32 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+}
+
+// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
+// 8 or 16 bits matter.
+def : Pat<(bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
+ (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+ (MMX_MOVD64rr GR32:$src)>;
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+ def : Pat<(bc_v2i32 (mmx_unpckl immAllZerosV,
+ (v2i32 (scalar_to_vector (load_mmx addr:$src))))),
+ (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+
+// Move MMX to the lower 64 bits of an XMM register.
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v4i16 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v2i32 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v1i64 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+// Move the lower 64 bits of an XMM register to MMX.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let Uses = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
+ def CMOV_V1I64 : I<0, Pseudo,
+ (outs VR64:$dst), (ins VR64:$t, VR64:$f, i8imm:$cond),
+ "#CMOV_V1I64 PSEUDO!",
+ [(set VR64:$dst,
+ (v1i64 (X86cmov VR64:$t, VR64:$f, imm:$cond,
+ EFLAGS)))]>;
+}
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..1fafa46
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,3643 @@
+//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+ SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
+def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
+def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>;
+def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>;
+def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
+def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
+def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
+def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm, i8imm);
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// Like 'store', but always requires vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>;
+def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (alignedload node:$ptr))>;
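+// For example, these fragments are what separate MOVAPS from MOVUPS below:
+//   MOVAPSrm: [(set VR128:$dst, (alignedloadv4f32 addr:$src))]
+//   MOVUPSrm: [(set VR128:$dst, (loadv4f32 addr:$src))]
+// so only loads known to be 16-byte aligned are selected to the aligned form.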
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.
+// FIXME: Actually implement support for targets that don't require the
+// alignment. This probably wants a subtarget predicate.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions; their memory operands are
+// not aligned on a 16-byte boundary, so only 8-byte alignment is checked here.
+// FIXME: 8-byte alignment for MMX reads is not required.
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopv8i8 : PatFrag<(ops node:$ptr), (v8i8 (memop64 node:$ptr))>;
+def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
+def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def PSxLDQ_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3);
+}]>;
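+// PSLLDQ / PSRLDQ shift the full 128-bit register by a byte count, so the
+// >> 3 converts a bit count into bytes; e.g. a 64-bit shift amount becomes
+// the immediate 8 (64 >> 3).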
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
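+// For example, a <3,2,1,0> mask (reverse the four elements) encodes as the
+// immediate 0x1B: two bits per destination element, lowest bits first,
+// selecting source elements 3, 2, 1 and 0.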
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
+}]>;
+
+def movddup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def shufp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshufhw_imm>;
+
+def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshuflw_imm>;
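+// When one of these fragments matches, the attached xform supplies the
+// immediate operand; e.g. SHUFPSrri below binds it through the fragment as
+//   (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2))
+// so $src3 receives the value computed by SHUFFLE_get_shuf_imm.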
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let Uses = [EFLAGS], usesCustomDAGSchedInserter = 1 in {
+ def CMOV_FR32 : I<0, Pseudo,
+ (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+ "#CMOV_FR32 PSEUDO!",
+ [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_FR64 : I<0, Pseudo,
+ (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+ "#CMOV_FR64 PSEUDO!",
+ [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_V4F32 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V4F32 PSEUDO!",
+ [(set VR128:$dst,
+ (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2F64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2F64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2I64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2I64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+}
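+// A rough sketch of the custom-inserter expansion for each of these pseudos,
+// assuming the usual diamond lowering: the pseudo becomes a conditional
+// branch over a fall-through block, and a PHI in the join block picks $t or
+// $f depending on which predecessor was taken, e.g.
+//
+//   thisMBB:   jCC sinkMBB            ; EFLAGS set by the preceding compare
+//   copy0MBB:  (fall through)
+//   sinkMBB:   $dst = phi of $t and $f, one incoming value per predecessor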
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
+def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi
+ (load addr:$src)))]>;
+def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
+def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi
+ (load addr:$src)))]>;
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ VR64:$src2))]>;
+ def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ (load addr:$src2)))]>;
+}
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si(load addr:$src)))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ GR32:$src2))]>;
+ def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, FR32:$src2), (implicit EFLAGS)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, (loadf32 addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), VR128:$src2),
+ (implicit EFLAGS)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), VR128:$src2),
+ (implicit EFLAGS)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins),
+ "pxor\t$dst, $dst", [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1 in
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+ def FsORPSrr : PSI<0x56, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+ def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
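+// Each defm above therefore expands to six "instructions"; e.g. ADD becomes
+// ADDSSrr, ADDSSrm, ADDPSrr, ADDPSrm, ADDSSrr_Int and ADDSSrm_Int, covering
+// the scalar, packed and intrinsic forms in both reg+reg and reg+mem flavors.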
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse_min_ss, int_x86_sse_min_ps>;
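+// These defms add the full-vector intrinsic pair as well, so MAX expands to
+// eight "instructions": MAXSSrr, MAXSSrm, MAXPSrr, MAXPSrm, MAXSSrr_Int,
+// MAXSSrm_Int, MAXPSrr_Int and MAXPSrm_Int (and likewise for MIN).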
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPS load and store
+let canFoldAsLoad = 1 in
+def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPSrm : PSI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movlp VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ def MOVHPSrm : PSI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movhp VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract of element 1 is always custom lowered to unpack high to low
+// and then extract element 0, so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (undef)), (iPTR 0))), addr:$dst)]>;
+
+let Constraints = "$src1 = $dst" in {
+let AddedComplexity = 20 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movhp VR128:$src1, VR128:$src2)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
+} // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+ int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
+ int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
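+// As with the binop multiclasses, each unop defm expands to eight
+// "instructions"; e.g. SQRT yields SQRTSSr, SQRTSSm, SQRTPSr, SQRTPSm,
+// SQRTSSr_Int, SQRTSSm_Int, SQRTPSr_Int and SQRTPSm_Int.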
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPSrr : PSI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (and VR128:$src1, VR128:$src2)))]>;
+ def ORPSrr : PSI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (or VR128:$src1, VR128:$src2)))]>;
+ def XORPSrr : PSI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (xor VR128:$src1, VR128:$src2)))]>;
+ }
+
+ def ANDPSrm : PSI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPSrm : PSI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPSrm : PSI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)))]>;
+ def ANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3
+ VR128:$src1, (memopv4f32 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1,
+ (memopv4f32 addr:$src2))))]>;
+
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskps\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetch intrinsic.
+def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
+def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
+def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
+def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Load, store, and memory fence
+def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins),
+ "xorps\t$dst, $dst",
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+
+let Predicates = [HasSSE1] in {
+ def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+ def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+ def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+ def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+ def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+}
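+// With these patterns, an all-zeros constant of any 128-bit vector type is
+// materialized by the single "xorps $dst, $dst" above rather than a
+// constant-pool load, unless folding such a load is judged beneficial.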
+
+// FR32 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: may not be able to eliminate this movss with coalescing because the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+// (f32 FR32:$src)>;
+let isAsCheapAsAMove = 1 in
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store (f32 (vector_extract (v4f32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+
+// Move to the lower bits of a VR128, leaving the upper bits alone.
+// Three-operand (but two-address) aliases.
+let Constraints = "$src1 = $dst" in {
+let neverHasSideEffects = 1 in
+ def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
+ (loadf32 addr:$src))))))]>;
+
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (MOVZSS2PSrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// SSE2 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+ Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+ Requires<[HasSSE2]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
+def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
+def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
+def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+ (load addr:$src)))]>;
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, FR64:$src2), (implicit EFLAGS)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, (loadf64 addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+ (implicit EFLAGS)]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+ (implicit EFLAGS)]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (load addr:$src2)),
+ (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instruction that maps fld0 to pxor for SSE.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins),
+ "pxor\t$dst, $dst", [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1 in
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+ def FsORPDrr : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+ def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+}
+}
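+
+// For illustration (a sketch of the TableGen defm expansion, not additional
+// definitions): "defm ADD" below yields the six records ADDSDrr and ADDSDrm
+// (scalar FR64 forms), ADDPDrr and ADDPDrm (packed v2f64 forms), and
+// ADDSDrr_Int and ADDSDrm_Int (intrinsic forms operating on a full VR128).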
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1,
+ (memopv2f64 addr:$src2)))]>;
+}
+}
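+
+// For illustration (a sketch of the defm expansion, not additional
+// definitions): "defm MAX" below yields the eight records MAXSDrr, MAXSDrm,
+// MAXPDrr, MAXPDrm, MAXSDrr_Int, MAXSDrm_Int, MAXPDrr_Int and MAXPDrm_Int,
+// covering the scalar, packed, scalar-intrinsic and packed-intrinsic forms
+// described above.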
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPD load and store
+def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPDrm : PDI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movlp VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ def MOVHPDrm : PDI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movhp VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (unpckh VR128:$src, (undef))),
+ (iPTR 0))), addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+ (memop addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memop addr:$src)))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+ (memop addr:$src)))]>,
+ XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memop addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+ (load addr:$src)))]>,
+ TB, Requires<[HasSSE2]>;
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+ (memop addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+// Aliases for intrinsics
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ VR128:$src2))]>, XS,
+ Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ (load addr:$src2)))]>, XS,
+ Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode FR64:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+}
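+
+// For illustration (a sketch of the defm expansion, not additional
+// definitions): "defm SQRT" below yields SQRTSDr, SQRTSDm, SQRTPDr, SQRTPDm,
+// SQRTSDr_Int, SQRTSDm_Int, SQRTPDr_Int and SQRTPDm_Int, i.e. the eight
+// reg/mem variants of the four forms described above.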
+
+// Square root.
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPDrr : PDI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ORPDrr : PDI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def XORPDrr : PDI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ }
+
+ def ANDPDrm : PDI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPDrm : PDI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPDrm : PDI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (memopv2i64 addr:$src2)))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3
+ VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1,
+ (memopv2f64 addr:$src2))))]>;
+
+ def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+//===----------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+let mayStore = 1 in
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+ XS, Requires<[HasSSE2]>;
+let mayStore = 1 in
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+ XS, Requires<[HasSSE2]>;
+
+// Intrinsic forms of MOVDQU load and store
+let canFoldAsLoad = 1 in
+def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ XS, Requires<[HasSSE2]>;
+
+let Constraints = "$src1 = $dst" in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr,
+ Intrinsic IntId, Intrinsic IntId2> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+ def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
+}
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1,(memopv2i64 addr:$src2)))]>;
+}
+
+} // Constraints = "$src1 = $dst"
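+
+// For illustration: "defm PADDB" below (via PDI_binop_rm) yields PADDBrr and
+// PADDBrm, "defm PADDQ" (via PDI_binop_rm_v2i64) yields PADDQrr and PADDQrm,
+// and the PDI_binop_rm_int / PDI_binop_rmi_int multiclasses likewise produce
+// rr/rm (plus ri for the shift-by-immediate case) variants built on the
+// corresponding SSE2 intrinsics.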
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
+
+defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
+
+// 128-bit logical shifts.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}", []>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}", []>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+
+  // Shift up / down and insert zeros.
+ def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSLLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+ def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSRLDQri VR128:$src, (PSxLDQ_imm imm:$amt)))>;
+}
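+
+// Note on the patterns above: the pslldq/psrldq instructions shift by a byte
+// count, while the psll_dq/psrl_dq intrinsics express the shift as a bit
+// count (the _bs "byte shift" variants already use bytes and pass their
+// immediate through unchanged). PSxLDQ_imm, defined earlier in this file, is
+// assumed here to rescale the bit count to bytes, so e.g. a 16-bit request
+// would select PSLLDQri with an immediate of 2.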
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let Constraints = "$src1 = $dst" in {
+ def PANDNrr : PDI<0xDF, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ VR128:$src2)))]>;
+
+ def PANDNrm : PDI<0xDF, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+ (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+ (PCMPEQBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+ (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+ (PCMPEQWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+ (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+ (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+ (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+ (PCMPGTBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+ (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+ (PCMPGTWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+ (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+ (PCMPGTDrm VR128:$src1, addr:$src2)>;
+
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ VR128:$src1, (undef))))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ (bc_v4i32(memopv2i64 addr:$src1)),
+ (undef))))]>;
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1,
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1,
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+
+
+let Constraints = "$src1 = $dst" in {
+ def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+
+ def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>;
+let Constraints = "$src1 = $dst" in {
+ def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+ def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+
+// Non-temporal stores
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5r, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6r, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+// TODO: custom lower this so as to never even generate the noop
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+ (i8 0)), (NOOP)>;
+def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+ (i8 1)), (MFENCE)>;
+
+// Alias instruction that maps an all-ones vector to pcmpeqd for SSE.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins),
+ "pcmpeqd\t$dst, $dst",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// FIXME: may not be able to eliminate this movsd with coalescing because the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+let isAsCheapAsAMove = 1 in
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let Constraints = "$src1 = $dst" in {
+ let neverHasSideEffects = 1 in
+ def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Store / copy the lower 64 bits of an XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in {
+def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
+ (loadf64 addr:$src))))))]>;
+
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
+}
+
+// movd / movq to XMM register zero-extends
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector GR32:$src)))))]>;
+// This is X86-64 only.
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector GR64:$src)))))]>;
+}
+
+let AddedComplexity = 20 in {
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+ (loadi32 addr:$src))))))]>;
+
+def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))]>, XS,
+ Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+// Move from XMM to XMM, clearing the upper 64 bits. Note: there is a bug in
+// the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movshdup
+ VR128:$src, (undef))))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movshdup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movsldup
+ VR128:$src, (undef))))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movsldup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
+def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
+ (undef))))]>;
+
+def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+}
+
+// Arithmetic
+let Constraints = "$src1 = $dst" in {
+ def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ (memop addr:$src2)))]>;
+ def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ (memop addr:$src2)))]>;
+}
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+ def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+def MONITOR : I<0x01, MRM1r, (outs), (ins), "monitor",
+ [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT : I<0x01, MRM1r, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
+ (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+ def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
+ (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+ def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
+multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
+multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv4i16 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
+multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv2i32 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
+}
+
+defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb",
+ int_x86_ssse3_pabs_b,
+ int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw",
+ int_x86_ssse3_pabs_w,
+ int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd",
+ int_x86_ssse3_pabs_d,
+ int_x86_ssse3_pabs_d_128>;
+
+/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv8i8 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv4i16 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv2i32 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PHADDW : SS3I_binop_rm_int_16<0x01, "phaddw",
+ int_x86_ssse3_phadd_w,
+ int_x86_ssse3_phadd_w_128>;
+defm PHADDD : SS3I_binop_rm_int_32<0x02, "phaddd",
+ int_x86_ssse3_phadd_d,
+ int_x86_ssse3_phadd_d_128>;
+defm PHADDSW : SS3I_binop_rm_int_16<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw,
+ int_x86_ssse3_phadd_sw_128>;
+defm PHSUBW : SS3I_binop_rm_int_16<0x05, "phsubw",
+ int_x86_ssse3_phsub_w,
+ int_x86_ssse3_phsub_w_128>;
+defm PHSUBD : SS3I_binop_rm_int_32<0x06, "phsubd",
+ int_x86_ssse3_phsub_d,
+ int_x86_ssse3_phsub_d_128>;
+defm PHSUBSW : SS3I_binop_rm_int_16<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw,
+ int_x86_ssse3_phsub_sw_128>;
+defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw,
+ int_x86_ssse3_pmadd_ub_sw_128>;
+defm PMULHRSW : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw,
+ int_x86_ssse3_pmul_hr_sw_128, 1>;
+defm PSHUFB : SS3I_binop_rm_int_8 <0x00, "pshufb",
+ int_x86_ssse3_pshuf_b,
+ int_x86_ssse3_pshuf_b_128>;
+defm PSIGNB : SS3I_binop_rm_int_8 <0x08, "psignb",
+ int_x86_ssse3_psign_b,
+ int_x86_ssse3_psign_b_128>;
+defm PSIGNW : SS3I_binop_rm_int_16<0x09, "psignw",
+ int_x86_ssse3_psign_w,
+ int_x86_ssse3_psign_w_128>;
+defm PSIGND : SS3I_binop_rm_int_32<0x0A, "psignd",
+ int_x86_ssse3_psign_d,
+ int_x86_ssse3_psign_d_128>;
+
+let Constraints = "$src1 = $dst" in {
+ def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, i16imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (int_x86_ssse3_palign_r
+ VR64:$src1, VR64:$src2,
+ imm:$src3))]>;
+ def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, i16imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (int_x86_ssse3_palign_r
+ VR64:$src1,
+ (bitconvert (memopv2i32 addr:$src2)),
+ imm:$src3))]>;
+
+ def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_ssse3_palign_r_128
+ VR128:$src1, VR128:$src2,
+ imm:$src3))]>, OpSize;
+ def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i32imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_ssse3_palign_r_128
+ VR128:$src1,
+ (bitconvert (memopv4i32 addr:$src2)),
+ imm:$src3))]>, OpSize;
+}
+
+def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+ (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
+def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+ (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
+// Since these loads aren't folded into the fextend, we have to match the
+// combination explicitly here.
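+// For example, "double d = *p" with a 'float *p' becomes
+// (fextend (loadf32 addr)) and is selected here to cvtss2sd with a folded
+// load.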
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>;
+
+// bit_convert
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Move a scalar to XMM zero-extended: movd to an XMM register zero-extends
+// the upper elements.
+let AddedComplexity = 15 in {
+// Zero a VR128, then do a MOVS{S|D} into the low element.
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
+ (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+ (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
+ (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Special unary SHUFPSrri case.
+def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPSrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE1]>;
+let AddedComplexity = 5 in
+def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+ (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case (v2i64).
+def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case (v2f64).
+def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+ (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+ (SHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
+ (SHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special binary v2i64 shuffle cases using SHUFPDrri.
+def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+ (SHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
+ (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
+ (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (movhp VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
+def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (movhp VR128:$src1, (load addr:$src2))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+// (store (vector_shuffle (load addr), v2, <0, 1, 4, 5>), addr) using MOVHPS
+def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4f32 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 (movhp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVHPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (movhp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVHPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+ (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// Set lowest element and zero upper elements.
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
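+// pandn computes (~dst) & src, so (and (xor x, all-ones), y) selects to PANDN
+// with x tied to the destination; the bc_v2i64 wrappers cover all-ones
+// vectors materialized as v4i32, v8i16, or v16i8.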
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// vector -> vector casts
+def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
+ (Int_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
+ (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
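+// (movaps/movups encode as 0F 28 / 0F 10 with no mandatory prefix, while
+// movdqa/movdqu need a leading 66 or F3 byte, hence the one-byte saving.)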
+def : Pat<(alignedloadv4i32 addr:$src),
+ (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(loadv4i32 addr:$src),
+ (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
+def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>;
+
+def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
+ string OpcodeStr,
+ Intrinsic V4F32Int,
+ Intrinsic V2F64Int> {
+  // Vector intrinsic operation, reg.
+ def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
+ OpSize;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int> {
+ // Intrinsic operation, reg.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ OpSize;
+}
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
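+// The immediate is the rounding control: bits 1:0 select the mode (nearest,
+// down, up, truncate), bit 2 selects MXCSR.RC instead, and bit 3 suppresses
+// the precision exception.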
+defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round",
+ int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+/// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq",
+ int_x86_sse41_pcmpeqq, 1>;
+defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw",
+ int_x86_sse41_packusdw, 0>;
+defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb",
+ int_x86_sse41_pminsb, 1>;
+defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd",
+ int_x86_sse41_pminsd, 1>;
+defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud",
+ int_x86_sse41_pminud, 1>;
+defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw",
+ int_x86_sse41_pminuw, 1>;
+defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb",
+ int_x86_sse41_pmaxsb, 1>;
+defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd",
+ int_x86_sse41_pmaxsd, 1>;
+defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud",
+ int_x86_sse41_pmaxud, 1>;
+defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw",
+ int_x86_sse41_pmaxuw, 1>;
+
+defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;
+
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+ (PCMPEQQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+ (PCMPEQQrm VR128:$src1, addr:$src2)>;
+
+/// SS41I_binop_patint - SSE 4.1 binary operator with pattern and intrinsic forms.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+ SDNode OpNode, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+ VR128:$src2))]>, OpSize {
+ let isCommutable = Commutable;
+ }
+ def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize;
+ def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (memop addr:$src2)))]>,
+ OpSize;
+ }
+}
+defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
+ int_x86_sse41_pmulld, 1>;
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
+ OpSize;
+ }
+}
+
+defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps",
+ int_x86_sse41_blendps, 0>;
+defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd",
+ int_x86_sse41_blendpd, 0>;
+defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw",
+ int_x86_sse41_pblendw, 0>;
+defm DPPS : SS41I_binop_rmi_int<0x40, "dpps",
+ int_x86_sse41_dpps, 1>;
+defm DPPD : SS41I_binop_rmi_int<0x41, "dppd",
+ int_x86_sse41_dppd, 1>;
+defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
+ int_x86_sse41_mpsadbw, 1>;
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ OpSize;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
+ }
+}
+
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+
+// Common patterns involving scalar load.
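+// The rm forms take a 64-bit memory operand, so a scalar i64 load that is
+// zero-extended into the low half of a vector (vzmovl_v2i64 / vzload_v2i64)
+// folds directly into the pmovsx/pmovzx memory form.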
+def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expects an i16 load any-extended to an i32 value.
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId (bitconvert
+ (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasSSE41]>;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, OpSize;
+ }
+}
+
+defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, FR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, (loadf32 addr:$src2),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}", []>, OpSize;
+}
+
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+
+/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
+
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+ (PCMPGTQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+ (PCMPGTQrm VR128:$src1, addr:$src2)>;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..f923106
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,560 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "llvm/Function.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Compiler.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ unsigned char *OldByte = (unsigned char *)Old;
+ *OldByte++ = 0xE9; // Emit JMP opcode.
+ unsigned *OldWord = (unsigned *)OldByte;
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)OldWord;
+ *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
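+  // The result is the standard 5-byte "jmp rel32" form (E9 xx xx xx xx); the
+  // displacement is relative to the end of the jump, i.e. New - (Old + 5).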
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
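+// For example, with __USER_LABEL_PREFIX__ defined as '_' (Darwin and other
+// a.out-style targets), ASMPREFIX "X86CompilationCallback" expands to
+// "_X86CompilationCallback"; on most ELF targets the prefix is empty.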
+
+// Check if building with -fPIC
+#if defined(__PIC__) && __PIC__ && defined(__linux__)
+#define ASMCALLSUFFIX "@PLT"
+#else
+#define ASMCALLSUFFIX
+#endif
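+// Calling through the PLT keeps the hand-written assembly below
+// position-independent when built with -fPIC and avoids a text relocation
+// against X86CompilationCallback2.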
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way to disable the use of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is quite old and doesn't support these directives).
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+// need to add some workarounds for compilers that support
+// only a subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee-saved registers for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+ // No need to save EAX/EDX for X86-64.
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ // Save RBP
+ "pushq %rbp\n"
+ CFI(".cfi_def_cfa_offset 16\n")
+ CFI(".cfi_offset %rbp, -16\n")
+ // Save RSP
+ "movq %rsp, %rbp\n"
+ CFI(".cfi_def_cfa_register %rbp\n")
+ // Save all int arg registers
+ "pushq %rdi\n"
+ CFI(".cfi_rel_offset %rdi, 0\n")
+ "pushq %rsi\n"
+ CFI(".cfi_rel_offset %rsi, 8\n")
+ "pushq %rdx\n"
+ CFI(".cfi_rel_offset %rdx, 16\n")
+ "pushq %rcx\n"
+ CFI(".cfi_rel_offset %rcx, 24\n")
+ "pushq %r8\n"
+ CFI(".cfi_rel_offset %r8, 32\n")
+ "pushq %r9\n"
+ CFI(".cfi_rel_offset %r9, 40\n")
+    // Align the stack on a 16-byte boundary. RSP might only be 8-byte
+    // aligned if this is called from an indirect stub.
+ "andq $-16, %rsp\n"
+ // Save all XMM arg registers
+ "subq $128, %rsp\n"
+ "movaps %xmm0, (%rsp)\n"
+ "movaps %xmm1, 16(%rsp)\n"
+ "movaps %xmm2, 32(%rsp)\n"
+ "movaps %xmm3, 48(%rsp)\n"
+ "movaps %xmm4, 64(%rsp)\n"
+ "movaps %xmm5, 80(%rsp)\n"
+ "movaps %xmm6, 96(%rsp)\n"
+ "movaps %xmm7, 112(%rsp)\n"
+ // JIT callee
+ "movq %rbp, %rdi\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rsi\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ // Restore all XMM arg registers
+ "movaps 112(%rsp), %xmm7\n"
+ "movaps 96(%rsp), %xmm6\n"
+ "movaps 80(%rsp), %xmm5\n"
+ "movaps 64(%rsp), %xmm4\n"
+ "movaps 48(%rsp), %xmm3\n"
+ "movaps 32(%rsp), %xmm2\n"
+ "movaps 16(%rsp), %xmm1\n"
+ "movaps (%rsp), %xmm0\n"
+ // Restore RSP
+ "movq %rbp, %rsp\n"
+ CFI(".cfi_def_cfa_register %rsp\n")
+ // Restore all int arg registers
+ "subq $48, %rsp\n"
+ CFI(".cfi_adjust_cfa_offset 48\n")
+ "popq %r9\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r9\n")
+ "popq %r8\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r8\n")
+ "popq %rcx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rcx\n")
+ "popq %rdx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdx\n")
+ "popq %rsi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rsi\n")
+ "popq %rdi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdi\n")
+ // Restore RBP
+ "popq %rbp\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rbp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+# else
+  // No inline assembler support on this platform. The routine is in an
+  // external file.
+ void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+# if defined(__APPLE__)
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+# endif
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register %esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+
+ // Same as X86CompilationCallback but also saves XMM argument registers.
+ void X86CompilationCallback_SSE(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+ TYPE_FUNCTION(X86CompilationCallback_SSE)
+ ASMPREFIX "X86CompilationCallback_SSE:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+ // Save all XMM arg registers
+ "subl $64, %esp\n"
+    // FIXME: provide frame move information for the xmm registers.
+    // This can be tricky, because the CFA register is ebp (unaligned)
+    // and we need to produce offsets relative to it.
+ "movaps %xmm0, (%esp)\n"
+ "movaps %xmm1, 16(%esp)\n"
+ "movaps %xmm2, 32(%esp)\n"
+ "movaps %xmm3, 48(%esp)\n"
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2" ASMCALLSUFFIX "\n"
+ "addl $16, %esp\n"
+ "movaps 48(%esp), %xmm3\n"
+ CFI(".cfi_restore %xmm3\n")
+ "movaps 32(%esp), %xmm2\n"
+ CFI(".cfi_restore %xmm2\n")
+ "movaps 16(%esp), %xmm1\n"
+ CFI(".cfi_restore %xmm1\n")
+ "movaps (%esp), %xmm0\n"
+ CFI(".cfi_restore %xmm0\n")
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback_SSE)
+ );
+# else
+ void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+ _declspec(naked) void X86CompilationCallback(void) {
+ __asm {
+ push ebp
+ mov ebp, esp
+ push eax
+ push edx
+ push ecx
+ and esp, -16
+ mov eax, dword ptr [ebp+4]
+ mov dword ptr [esp+4], eax
+ mov dword ptr [esp], ebp
+ call X86CompilationCallback2
+ mov esp, ebp
+ sub esp, 12
+ pop ecx
+ pop edx
+ pop eax
+ pop ebp
+ ret
+ }
+ }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+ void X86CompilationCallback() {
+ assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
+ abort();
+ }
+#endif
+}
+
+/// X86CompilationCallback2 - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call. This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" void ATTRIBUTE_USED
+X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+ intptr_t *RetAddrLoc = &StackPtr[1];
+ assert(*RetAddrLoc == RetAddr &&
+ "Could not find return address on the stack!");
+
+ // It's a stub if there is an interrupt marker after the call.
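+  // (emitFunctionStub below plants a 0xCD "int" byte right after the call in
+  // every lazy stub; an ordinary call site has whatever instruction follows
+  // the call there instead, so this byte distinguishes the two cases.)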
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+
+  // The call instruction should have pushed the return address onto the stack...
+#if defined (X86_64_JIT)
+ RetAddr--; // Backtrack to the reference itself...
+#else
+ RetAddr -= 4; // Backtrack to the reference itself...
+#endif
+
+#if 0
+ DOUT << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+ // Sanity check to make sure this really is a call instruction.
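+  // On x86-64, lazily-resolved call sites go through %r10 ("callq *%r10",
+  // encoded 41 FF D2); RetAddr now points at the ModRM byte, so the two
+  // preceding bytes must be the REX prefix and the FF opcode. On x86-32,
+  // RetAddr points at the rel32 field of a direct call, which is immediately
+  // preceded by the E8 opcode byte.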
+#if defined (X86_64_JIT)
+ assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+ assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+ assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call.
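+  // On x86-64 the call site is "49 BA imm64; 41 FF D2" (movabsq $Target,
+  // %r10; callq *%r10) and RetAddr points at the final ModRM byte, so
+  // RetAddr-0xa is the start of the imm64 field, RetAddr-0xc is the first
+  // byte of the sequence, and subtracting 0xd from the saved return address
+  // below re-executes it from the beginning.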
+#if defined (X86_64_JIT)
+ if (!isStub)
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+#else
+ *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+ if (isStub) {
+ // If this is a stub, rewrite the call into an unconditional branch
+ // instruction so that two return addresses are not pushed onto the stack
+ // when the requested function finally gets called. This also makes the
+    // 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
+#if defined (X86_64_JIT)
+ // If the target address is within 32-bit range of the stub, use a
+ // PC-relative branch instead of loading the actual address. (This is
+ // considerably shorter than the 64-bit immediate load already there.)
+ // We assume here intptr_t is 64 bits.
+ intptr_t diff = NewVal-RetAddr+7;
+ if (diff >= -2147483648LL && diff <= 2147483647LL) {
+ *(unsigned char*)(RetAddr-0xc) = 0xE9;
+ *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+ } else {
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+ ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+ }
+#else
+ ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+ }
+
+ // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+ *RetAddrLoc -= 0xd;
+#else
+ *RetAddrLoc -= 5;
+#endif
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
+ // FIXME: support for AMD family of processors.
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
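+      // CPUID leaf 1: bit 25 of EDX indicates SSE support.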
+ if ((EDX >> 25) & 0x1)
+ return X86CompilationCallback_SSE;
+ }
+ }
+#endif
+
+ return X86CompilationCallback;
+}
+
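+// The indirect symbol emitted below is simply a pointer-sized data slot
+// holding 'ptr' (two little-endian 32-bit words on x86-64, one on x86-32).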
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+ JCE.startGVStub(GV, 8, 8);
+ JCE.emitWordLE((unsigned)(intptr_t)ptr);
+ JCE.emitWordLE((unsigned)(((intptr_t)ptr) >> 32));
+#else
+ JCE.startGVStub(GV, 4, 4);
+ JCE.emitWordLE((intptr_t)ptr);
+#endif
+ return JCE.finishGVStub(GV);
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
+ Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+ bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
+#endif
+ if (NotCC) {
+#if defined (X86_64_JIT)
+ JCE.startGVStub(F, 13, 4);
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Fn);
+ JCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // jmpq *r10
+ JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+ JCE.startGVStub(F, 5, 4);
+ JCE.emitByte(0xE9);
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+#endif
+ return JCE.finishGVStub(F);
+ }
+
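+  // For reference, the lazy-compilation stub emitted below is
+  //   movabsq $Fn, %r10; callq *%r10; 0xCD   (49 BA <imm64> 41 FF D2 CD)
+  // on x86-64, and 'call rel32' followed by 0xCD on x86-32; the trailing 0xCD
+  // byte is the marker that X86CompilationCallback2 checks for.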
+#if defined (X86_64_JIT)
+ JCE.startGVStub(F, 14, 4);
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Fn);
+ JCE.emitWordLE((unsigned)(((intptr_t)Fn) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // callq *r10
+ JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+ JCE.startGVStub(F, 6, 4);
+ JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination...
+
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+#endif
+
+ JCE.emitByte(0xCD); // Interrupt - Just a marker identifying the stub!
+ return JCE.finishGVStub(F);
+}
+
+void X86JITInfo::emitFunctionStubAtAddr(const Function* F, void *Fn, void *Stub,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+ JCE.startGVStub(F, Stub, 5);
+ JCE.emitByte(0xE9);
+#if defined (X86_64_JIT)
+ assert(((((intptr_t)Fn-JCE.getCurrentPCValue()-5) << 32) >> 32) ==
+ ((intptr_t)Fn-JCE.getCurrentPCValue()-5)
+ && "PIC displacement does not fit in displacement field!");
+#endif
+ JCE.emitWordLE((intptr_t)Fn-JCE.getCurrentPCValue()-4);
+ JCE.finishGVStub(F);
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specified basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+ return BB - Entry;
+#else
+ return BB - PICBase;
+#endif
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((X86::RelocationType)MR->getRelocationType()) {
+ case X86::reloc_pcrel_word: {
+ // PC relative relocation, add the relocated value to the value already in
+ // memory, after we adjust it for where the PC is.
+ ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_picrel_word: {
+ // PIC base relative relocation, add the relocated value to the value
+ // already in memory, after we adjust it for where the PIC base is.
+ ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_absolute_word:
+ // Absolute relocation, just add the relocated value to the value already
+ // in memory.
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ case X86::reloc_absolute_dword:
+ *((intptr_t*)RelocPos) += ResultPtr;
+ break;
+ }
+ }
+}
+
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+ TLSOffset -= size;
+ return TLSOffset;
+#else
+ assert(0 && "Cannot allocate thread local storage on this arch!\n");
+ return 0;
+#endif
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..6a4e214
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,84 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class X86TargetMachine;
+
+ class X86JITInfo : public TargetJITInfo {
+ X86TargetMachine &TM;
+ uintptr_t PICBase;
+ char* TLSOffset;
+ public:
+ explicit X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+ useGOT = 0;
+ TLSOffset = 0;
+ }
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+ /// to emit an indirect symbol which contains the address of the specified
+ /// ptr.
+ virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE);
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+
+ /// emitFunctionStubAtAddr - Use the specified JITCodeEmitter object to
+ /// emit a small native function that simply calls Fn. Emit the stub into
+ /// the supplied buffer.
+ virtual void emitFunctionStubAtAddr(const Function* F, void *Fn,
+ void *Buffer, JITCodeEmitter &JCE);
+
+ /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+  /// specified basic block.
+ virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// allocateThreadLocalMemory - Each target has its own way of
+ /// handling thread local variables. This method returns a value only
+ /// meaningful to the target.
+ virtual char* allocateThreadLocalMemory(size_t size);
+
+ /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+ /// PIC jumptable entry.
+ void setPICBase(uintptr_t Base) { PICBase = Base; }
+ uintptr_t getPICBase() const { return PICBase; }
+ };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..8a5ac2c
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,112 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+enum NameDecorationStyle {
+ None,
+ StdCall,
+ FastCall
+};
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
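+/// Passes obtain it via MF.getInfo<X86MachineFunctionInfo>() (see, for
+/// example, the frame lowering code in X86RegisterInfo.cpp).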
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocas or having
+  /// frame pointer elimination turned off. For example, the Cygwin main
+  /// function contains stack pointer re-alignment code, which requires a
+  /// frame pointer.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes the function pops on return.
+  /// Used on Windows for stdcall and fastcall name decoration.
+ unsigned BytesToPopOnReturn;
+
+ /// DecorationStyle - If the function requires additional name decoration,
+ /// DecorationStyle holds the right way to do so.
+ NameDecorationStyle DecorationStyle;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex;
+
+  /// TailCallReturnAddrDelta - The delta by which the return-address stack
+  /// slot is moved. Used to create an area before the register spill area on
+  /// the stack so that the return address can be safely moved into it.
+ int TailCallReturnAddrDelta;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - Keeps track of the virtual register initialized for
+  /// use as the global base register. This is used in some PIC relocation
+  /// models.
+ unsigned GlobalBaseReg;
+
+public:
+ X86MachineFunctionInfo() : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0) {}
+
+ X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+ void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..5af1fb1
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1280 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKDOWN64 :
+ X86::ADJCALLSTACKDOWN32,
+ tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKUP64 :
+ X86::ADJCALLSTACKUP32),
+ TM(tm), TII(tii) {
+ // Cache some information.
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ Is64Bit = Subtarget->is64Bit();
+ IsWin64 = Subtarget->isTargetWin64();
+ StackAlign = TM.getFrameInfo()->getStackAlignment();
+ if (Is64Bit) {
+ SlotSize = 8;
+ StackPtr = X86::RSP;
+ FramePtr = X86::RBP;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ }
+}
+
+// getDwarfRegNum - This function maps LLVM register identifiers to the
+// Dwarf specific numbering, used in debug info and exception tables.
+
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ unsigned Flavour = DWARFFlavour::X86_64;
+ if (!Subtarget->is64Bit()) {
+ if (Subtarget->isTargetDarwin()) {
+ if (isEH)
+ Flavour = DWARFFlavour::X86_32_DarwinEH;
+ else
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else if (Subtarget->isTargetCygMing()) {
+      // Unsupported for now; just a quick fallback.
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else {
+ Flavour = DWARFFlavour::X86_32_Generic;
+ }
+ }
+
+ return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
+}
+
+// getX86RegNum - This function maps LLVM register identifiers to their X86
+// specific numbering, which is used in various places encoding instructions.
+//
+unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
+ switch(RegNo) {
+ case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+ case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+ case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+ case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+ case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+ return N86::ESP;
+ case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+ return N86::EBP;
+ case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+ return N86::ESI;
+ case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+ return N86::EDI;
+
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ return N86::EAX;
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ return N86::ECX;
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ return N86::EDX;
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ return N86::EBX;
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ return N86::ESP;
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ return N86::EBP;
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ return N86::ESI;
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ return N86::EDI;
+
+ case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+ case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+ return RegNo-X86::ST0;
+
+ case X86::XMM0: case X86::XMM8: case X86::MM0:
+ return 0;
+ case X86::XMM1: case X86::XMM9: case X86::MM1:
+ return 1;
+ case X86::XMM2: case X86::XMM10: case X86::MM2:
+ return 2;
+ case X86::XMM3: case X86::XMM11: case X86::MM3:
+ return 3;
+ case X86::XMM4: case X86::XMM12: case X86::MM4:
+ return 4;
+ case X86::XMM5: case X86::XMM13: case X86::MM5:
+ return 5;
+ case X86::XMM6: case X86::XMM14: case X86::MM6:
+ return 6;
+ case X86::XMM7: case X86::XMM15: case X86::MM7:
+ return 7;
+
+ default:
+ assert(isVirtualRegister(RegNo) && "Unknown physical register!");
+ assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+ return 0;
+ }
+}
+
+const TargetRegisterClass *X86RegisterInfo::getPointerRegClass() const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ if (Subtarget->is64Bit())
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return NULL;
+}
+
+const unsigned *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+
+ if (MF) {
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+ }
+
+ static const unsigned CalleeSavedRegs32Bit[] = {
+ X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs32EHRet[] = {
+ X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64Bit[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64EHRet[] = {
+ X86::RAX, X86::RDX, X86::RBX, X86::R12,
+ X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegsWin64[] = {
+ X86::RBX, X86::RBP, X86::RDI, X86::RSI,
+ X86::R12, X86::R13, X86::R14, X86::R15,
+ X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9,
+ X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13,
+ X86::XMM14, X86::XMM15, 0
+ };
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegsWin64;
+ else
+ return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit);
+ } else {
+ return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit);
+ }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+
+ if (MF) {
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+ }
+
+ static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64EHRet[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClassesWin64[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass, 0
+ };
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegClassesWin64;
+ else
+ return (callsEHReturn ?
+ CalleeSavedRegClasses64EHRet : CalleeSavedRegClasses64Bit);
+ } else {
+ return (callsEHReturn ?
+ CalleeSavedRegClasses32EHRet : CalleeSavedRegClasses32Bit);
+ }
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ // Set the stack-pointer register and its aliases as reserved.
+ Reserved.set(X86::RSP);
+ Reserved.set(X86::ESP);
+ Reserved.set(X86::SP);
+ Reserved.set(X86::SPL);
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ Reserved.set(X86::EBP);
+ Reserved.set(X86::BP);
+ Reserved.set(X86::BPL);
+ }
+ // Mark the x87 stack registers as reserved, since they don't
+ // behave normally with respect to liveness. We don't fully
+ // model the effects of x87 stack pushes and pops after
+ // stackification.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
+ unsigned MaxAlign = 0;
+ for (int i = FFI->getObjectIndexBegin(),
+ e = FFI->getObjectIndexEnd(); i != e; ++i) {
+ if (FFI->isDeadObjectIndex(i))
+ continue;
+ unsigned Align = FFI->getObjectAlignment(i);
+ MaxAlign = std::max(MaxAlign, Align);
+ }
+
+ return MaxAlign;
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+ return (NoFramePointerElim ||
+ needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ (MMI && MMI->callsUnwindInit()));
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // FIXME: Currently we don't support stack realignment for functions with
+ // variable-sized allocas
+ return (RealignStack &&
+ (MFI->getMaxAlignment() > StackAlign &&
+ !MFI->hasVarSizedObjects()));
+}
+
+bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+int
+X86RegisterInfo::getFrameIndexOffset(MachineFunction &MF, int FI) const {
+ int Offset = MF.getFrameInfo()->getObjectOffset(FI) + SlotSize;
+ uint64_t StackSize = MF.getFrameInfo()->getStackSize();
+
+ if (needsStackRealignment(MF)) {
+ if (FI < 0)
+ // Skip the saved EBP
+ Offset += SlotSize;
+ else {
+ unsigned Align = MF.getFrameInfo()->getObjectAlignment(FI);
+ assert( (-(Offset + StackSize)) % Align == 0);
+ Align = 0;
+ return Offset + StackSize;
+ }
+
+ // FIXME: Support tail calls
+ } else {
+ if (!hasFP(MF))
+ return Offset + StackSize;
+
+ // Skip the saved EBP
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset;
+}
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+ // If the stack pointer can be changed after prologue, turn the
+ // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackdown instruction into 'add ESP, <amt>'
+ // TODO: consider using push / pop instead of sub + store / add
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+ MachineInstr *New = 0;
+ if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, Old->getDebugLoc(),
+ TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+ StackPtr).addReg(StackPtr).addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+ // factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ Amount -= CalleeAmt;
+ if (Amount) {
+ unsigned Opc = (Amount < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+ New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction...
+ MBB.insert(I, New);
+ }
+ }
+ } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ unsigned Opc = (CalleeAmt < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+ MachineInstr *Old = I;
+ MachineInstr *New =
+ BuildMI(MF, Old->getDebugLoc(), TII.get(Opc),
+ StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
+
+void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const{
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+
+ unsigned BasePtr;
+ if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else
+ BasePtr = (hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four-operand memory reference.  Replace the
+  // FrameIndex with the base register.  Add the frame object offset to the
+  // existing offset.
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+ // Now add the frame object offset to the offset from EBP.
+ if (MI.getOperand(i+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Offset = getFrameIndexOffset(MF, FrameIndex) +
+ (int)(MI.getOperand(i+3).getImm());
+
+ MI.getOperand(i+3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = getFrameIndexOffset(MF, FrameIndex) +
+ (uint64_t)MI.getOperand(i+3).getOffset();
+ MI.getOperand(i+3).setOffset(Offset);
+ }
+}
+
+void
+X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *FFI = MF.getFrameInfo();
+
+ // Calculate and set max stack object alignment early, so we can decide
+ // whether we will need stack realignment (and thus FP).
+ unsigned MaxAlign = std::max(FFI->getMaxAlignment(),
+ calculateMaxStackAlignment(FFI));
+
+ FFI->setMaxAlignment(MaxAlign);
+}
+
+void
+X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MF.getFrameInfo()->
+ CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1*SlotSize)+TailCallReturnAddrDelta);
+ }
+ if (hasFP(MF)) {
+ assert((TailCallReturnAddrDelta <= 0) &&
+ "The Delta should always be zero or negative");
+ // Create a frame entry for the EBP register that must be saved.
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ (int)SlotSize * -2+
+ TailCallReturnAddrDelta);
+ assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ "Slot for EBP register must be last in order to be found!");
+ FrameIdx = 0;
+ }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+ const TargetInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ unsigned Opc = isSub
+ ? ((Offset < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+ : ((Offset < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
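+  // Adjustments larger than 2^31-1 are split into multiple ADD/SUB
+  // instructions, since the immediate operand is at most a signed 32-bit value.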
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(ThisVal);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ Offset -= ThisVal;
+ }
+}
+
+// mergeSPUpdatesUp - If the instruction before MBBI adjusts the stack pointer,
+// fold that adjustment into *NumBytes and erase it.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ if (MBBI == MBB.begin()) return;
+
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ }
+}
+
+// mergeSPUpdatesDown - If the instruction after MBBI adjusts the stack pointer,
+// fold that adjustment into *NumBytes and erase it.  Note that this is
+// currently disabled by the early return below.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ return;
+
+ if (MBBI == MBB.end()) return;
+
+ MachineBasicBlock::iterator NI = next(MBBI);
+ if (NI == MBB.end()) return;
+
+ unsigned Opc = NI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ }
+}
+
+/// mergeSPUpdates - Check the instruction before or after the passed
+/// instruction.  If it is an ADD or SUB of the stack pointer, it is deleted
+/// and the stack adjustment is returned as a positive value for ADD and a
+/// negative value for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ int Offset = 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
+void X86RegisterInfo::emitFrameMoves(MachineFunction &MF,
+ unsigned FrameLabelId,
+ unsigned ReadyLabelId) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ if (!MMI)
+ return;
+
+ uint64_t StackSize = MFI->getStackSize();
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+ const TargetData *TD = MF.getTarget().getTargetData();
+
+  // Calculate the number of bytes used to store the return address.
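+  // On x86 the stack grows down, so stackGrowth is negative (-4 or -8).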
+ int stackGrowth =
+ (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize());
+
+ if (StackSize) {
+ // Show update of SP.
+ if (hasFP(MF)) {
+ // Adjust SP
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ } else {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP,
+ -StackSize+stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+ } else {
+ //FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  // FIXME: This is a dirty hack.  The code itself is a mess right now.
+  // It should be rewritten from scratch and generalized at some point.
+
+  // Determine the maximum offset (the minimum, due to stack growth).
+ int64_t MaxOffset = 0;
+ for (unsigned I = 0, E = CSI.size(); I!=E; ++I)
+ MaxOffset = std::min(MaxOffset,
+ MFI->getObjectOffset(CSI[I].getFrameIdx()));
+
+ // Calculate offsets
+ int64_t saveAreaOffset = (hasFP(MF) ? 3 : 2)*stackGrowth;
+ for (unsigned I = 0, E = CSI.size(); I!=E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+ unsigned Reg = CSI[I].getReg();
+ Offset = (MaxOffset-Offset+saveAreaOffset);
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+
+ if (hasFP(MF)) {
+ // Save FP
+ MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+
+ MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+}
+
+
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function* Fn = MF.getFunction();
+ const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
+ !Fn->doesNotThrow() ||
+ UnwindTablesMandatory;
+ DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+ DebugLoc::getUnknownLoc());
+
+ // Prepare for frame info.
+ unsigned FrameLabelId = 0;
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+
+ // Get desired stack alignment
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));
+
+  // If this is x86-64, the Red Zone is not disabled, we are a leaf function
+  // using at most 128 bytes of stack space, and we have no frame pointer,
+  // calls, or dynamic allocas, then we do not need to adjust the stack
+  // pointer (we fit in the Red Zone).
+ if (Is64Bit && !DisableRedZone &&
+ !needsStackRealignment(MF) &&
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->hasCalls()) { // No calls.
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (hasFP(MF)) MinSize += SlotSize;
+ StackSize = std::max(MinSize,
+ StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+  // size is bigger than the caller's.
+ if (TailCallReturnAddrDelta < 0) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
+ StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+
+ uint64_t NumBytes = 0;
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Get the offset of the stack slot for the EBP register... which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(-NumBytes);
+
+ // Save EBP into the appropriate stack slot...
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(FramePtr, RegState::Kill);
+
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
+ }
+
+ // Update EBP with the new base value...
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+ .addReg(StackPtr);
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(FramePtr);
+
+ // Realign stack
+ if (needsStackRealignment(MF)) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
+ StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+ } else {
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ unsigned ReadyLabelId = 0;
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer is ready.
+ ReadyLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
+ }
+
+ // Skip the callee-saved push instructions.
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r))
+ ++MBBI;
+
+ if (MBBI != MBB.end())
+ DL = MBBI->getDebugLoc();
+
+ if (NumBytes) { // adjust stack pointer: ESP -= numbytes
+ if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+      // Check whether EAX is live-in for this function.
+ bool isEAXAlive = false;
+ for (MachineRegisterInfo::livein_iterator
+ II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
+ unsigned Reg = II->first;
+ isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL);
+ }
+
+      // The function prologue calls _alloca to probe the stack when allocating
+      // more than 4k bytes in one go.  Touching the stack at 4K increments is
+      // necessary to ensure that the guard pages used by the OS virtual memory
+      // manager are allocated in the correct sequence.
+ if (!isEAXAlive) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill);
+ // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+ // allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ // Restore EAX
+ MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+ X86::EAX),
+ StackPtr, false, NumBytes-4);
+ MBB.insert(MBBI, MI);
+ }
+ } else {
+      // If there is a SUB32ri of ESP immediately before this instruction,
+      // merge the two.  This can be the case when tail call elimination is
+      // enabled and the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+ }
+ }
+
+ if (needsFrameMoves)
+ emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
+}
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ switch (RetOpcode) {
+ case X86::RET:
+ case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNdi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm: break; // These are ok
+ default:
+ assert(0 && "Can only insert epilog into returning blocks");
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+ NumBytes = FrameSize - CSSize;
+
+ // pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ MachineBasicBlock::iterator LastCSPop = MBBI;
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if (Opc != X86::POP32r && Opc != X86::POP64r &&
+ !PI->getDesc().isTerminator())
+ break;
+ --MBBI;
+ }
+
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  // If dynamic allocas are used, reset ESP to point to the last callee-saved
+  // slot before popping the callee-saved registers off.  The same applies when
+  // the stack was realigned.
+ if (needsStackRealignment(MF)) {
+    // We cannot use LEA here because the stack pointer was realigned.  We
+    // need to deallocate the local frame first.
+ if (CSSize) {
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ MBBI = prior(LastCSPop);
+ }
+
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(FramePtr);
+ } else if (MFI->hasVarSizedObjects()) {
+ if (CSSize) {
+ unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+ MachineInstr *MI = addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
+ MBB.insert(MBBI, MI);
+ } else
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(FramePtr);
+
+ } else {
+ // adjust stack pointer back: ESP += numbytes
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ }
+
+ // We're returning from function via eh_return.
+ if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(DestAddr.getReg());
+ // Tail call return: adjust the stack pointer and jump to callee
+ } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+ RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+    // Incorporate the retaddr area.
+ Offset = StackAdj-MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+      // Check for a possible merge with the preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
+ }
+
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi|| RetOpcode == X86::TCRETURNdi64)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPd)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ else if (RetOpcode== X86::TCRETURNri64)
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr), JumpTarget.getReg());
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = prior(MBB.end());
+    // Check for a possible merge with the preceding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
+ }
+}
+
+unsigned X86RegisterInfo::getRARegister() const {
+ if (Is64Bit)
+ return X86::RIP; // Should have dwarf #16
+ else
+ return X86::EIP; // Should have dwarf #8
+}
+
+unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+ const {
+  // Calculate the number of bytes used to store the return address.
+ int stackGrowth = (Is64Bit ? -8 : -4);
+
+  // Initial state of the frame pointer is esp+4 (rsp+8 on x86-64).
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(0, Dst, Src));
+
+ // Add return address to move list
+ MachineLocation CSDst(StackPtr, stackGrowth);
+ MachineLocation CSSrc(getRARegister());
+ Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+namespace llvm {
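+/// getX86SubSuperRegister - Return the sub- or super-register of Reg with the
+/// requested size, e.g. getX86SubSuperRegister(X86::RAX, MVT::i8) returns
+/// X86::AL, and passing High == true returns X86::AH instead.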
+unsigned getX86SubSuperRegister(unsigned Reg, MVT VT, bool High) {
+ switch (VT.getSimpleVT()) {
+ default: return Reg;
+ case MVT::i8:
+ if (High) {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case MVT::i16:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case MVT::i32:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case MVT::i64:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+
+ return Reg;
+}
+}
+
+#include "X86GenRegisterInfo.inc"
+
+namespace {
+ struct VISIBILITY_HIDDEN MSAC : public MachineFunctionPass {
+ static char ID;
+ MSAC() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ MachineFrameInfo *FFI = MF.getFrameInfo();
+ MachineRegisterInfo &RI = MF.getRegInfo();
+
+ // Calculate max stack alignment of all already allocated stack objects.
+ unsigned MaxAlign = calculateMaxStackAlignment(FFI);
+
+      // Be over-conservative: scan over all vreg defs and check whether vector
+      // registers are used.  If so, there is a chance that a vector register
+      // will be spilled, and the stack then needs to be aligned properly.
+ for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+ RegNum < RI.getLastVirtReg(); ++RegNum)
+ MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
+
+ FFI->setMaxAlignment(MaxAlign);
+
+ return false;
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 Maximal Stack Alignment Calculator";
+ }
+ };
+
+ char MSAC::ID = 0;
+}
+
+FunctionPass*
+llvm::createX86MaxStackAlignmentCalculatorPass() { return new MSAC(); }
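+
+// Illustrative only (not part of the original source): a target's pass setup
+// would typically schedule this pass before register allocation, roughly
+//   PM.add(createX86MaxStackAlignmentCalculatorPass());
+// where PM is the codegen PassManagerBase.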
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..33b9f5e
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,163 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+ class Type;
+ class TargetInstrInfo;
+ class X86TargetMachine;
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+namespace X86 {
+ /// SubregIndex - The index of various sized subregister classes. Note that
+ /// these indices must be kept in sync with the class indices in the
+ /// X86RegisterInfo.td file.
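+ /// (They correspond to the x86_subreg_8bit, x86_subreg_8bit_hi,
+ /// x86_subreg_16bit and x86_subreg_32bit PatLeaf values defined there.)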
+ enum SubregIndex {
+ SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
+ };
+}
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+ X86TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+private:
+ /// Is64Bit - Is the target 64-bit?
+ ///
+ bool Is64Bit;
+
+ /// IsWin64 - Is the target one of the Win64 flavours?
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackAlign - Default stack alignment.
+ ///
+ unsigned StackAlign;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+public:
+ X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+ /// getX86RegNum - Returns the native X86 register number for the given LLVM
+ /// register identifier.
+ static unsigned getX86RegNum(unsigned RegNo);
+
+ unsigned getStackAlignment() const { return StackAlign; }
+
+ /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
+ /// (created by TableGen) for target dependencies.
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ /// Code Generation virtual methods...
+ ///
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *getPointerRegClass() const;
+
+ /// getCrossCopyRegClass - Returns a legal register class to copy a register
+ /// in the specified class to or from. Returns NULL if it is possible to copy
+ /// between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+ /// register classes to spill each callee-saved register with. The order and
+ /// length of this list match the getCalleeSavedRegs() list.
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+ /// getReservedRegs - Returns a bitset indexed by physical register number
+ /// indicating if a register is a special register that has particular uses and
+ /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+ /// the register scavenger to determine which registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool needsStackRealignment(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ void emitFrameMoves(MachineFunction &MF,
+ unsigned FrameLabelId, unsigned ReadyLabelId) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ int getFrameIndexOffset(MachineFunction &MF, int FI) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
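+// Further illustrative examples:
+//   getX86SubSuperRegister(X86::EAX, MVT::i8)        returns X86::AL
+//   getX86SubSuperRegister(X86::EAX, MVT::i8, true)  returns X86::AH
+//   getX86SubSuperRegister(X86::EAX, MVT::i64)       returns X86::RAX
+//   (the i64 form is only meaningful when targeting x86-64)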
+unsigned getX86SubSuperRegister(unsigned, MVT, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..d552cb3
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,762 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+let Namespace = "X86" in {
+
+ // In the register alias definitions below, we define which registers alias
+ // which others. We only specify which registers the small registers alias,
+ // because the register file generator is smart enough to figure out that
+ // AL aliases AX if we tell it that AX aliases AL (for example).
+
+ // Dwarf numbering is different for 32-bit and 64-bit, and there are
+ // variations by target as well. Currently the first entry is for X86-64,
+ // the second for EH on X86-32/Darwin, and the third is the 'generic' one
+ // (X86-32/Linux and debug information on X86-32/Darwin).
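+ //
+ // For example, DwarfRegNum<[1, 2, 2]> on DL below means DWARF register 1 in
+ // the X86-64 numbering and register 2 in the other two flavours; -2 marks a
+ // register that has no number in that flavour (the X86-64-only registers).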
+
+ // 8-bit registers
+ // Low registers
+ def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
+ def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
+ def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
+ def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
+
+ // X86-64 only
+ def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
+ def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
+ def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
+ def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
+ def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>;
+ def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>;
+ def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
+ def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
+ def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
+ def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
+ def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
+ def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+
+ // High registers. On x86-64, these cannot be used in any instruction
+ // with a REX prefix.
+ def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
+ def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
+ def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
+ def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+
+ // 16-bit registers
+ def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
+ def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
+ def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
+ def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+ def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
+ def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
+ def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
+ def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+ def IP : Register<"ip">, DwarfRegNum<[16]>;
+
+ // X86-64 only
+ def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
+ def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
+ def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
+ def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
+ def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
+ def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
+ def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
+ def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+
+ // 32-bit registers
+ def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
+ def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
+ def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
+ def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
+ def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
+ def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
+ def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
+ def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
+ def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;
+
+ // X86-64 only
+ def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
+ def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
+ def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
+ def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
+ def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
+ def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
+ def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
+ def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
+
+ // 64-bit registers, X86-64 only
+ def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
+ def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
+ def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
+ def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
+ def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
+ def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
+ def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
+ def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+ def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
+ def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
+ def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
+ def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
+ def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
+ def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
+ def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
+ def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
+ def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>;
+
+ // MMX Registers. These are actually aliased to ST0 .. ST7
+ def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
+ def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
+ def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
+ def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
+ def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
+ def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
+ def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
+ def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
+
+ // Pseudo Floating Point registers
+ def FP0 : Register<"fp0">;
+ def FP1 : Register<"fp1">;
+ def FP2 : Register<"fp2">;
+ def FP3 : Register<"fp3">;
+ def FP4 : Register<"fp4">;
+ def FP5 : Register<"fp5">;
+ def FP6 : Register<"fp6">;
+
+ // XMM Registers, used by the various SSE instruction set extensions
+ def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
+ def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
+ def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
+ def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
+ def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
+ def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
+ def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
+ def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
+
+ // X86-64 only
+ def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>;
+ def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>;
+ def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
+ def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
+ def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
+ def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
+ def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
+ def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
+
+ // Floating point stack registers
+ def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
+ def ST1 : Register<"st(1)">, DwarfRegNum<[34, 13, 12]>;
+ def ST2 : Register<"st(2)">, DwarfRegNum<[35, 14, 13]>;
+ def ST3 : Register<"st(3)">, DwarfRegNum<[36, 15, 14]>;
+ def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>;
+ def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>;
+ def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>;
+ def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>;
+
+ // Status flags register
+ def EFLAGS : Register<"flags">;
+
+ // Segment registers
+ def CS : Register<"cs">;
+ def DS : Register<"ds">;
+ def SS : Register<"ss">;
+ def ES : Register<"es">;
+ def FS : Register<"fs">;
+ def GS : Register<"gs">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subregister Set Definitions... now that we have all of the pieces, define the
+// sub registers for each register.
+//
+
+def x86_subreg_8bit : PatLeaf<(i32 1)>;
+def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
+def x86_subreg_16bit : PatLeaf<(i32 3)>;
+def x86_subreg_32bit : PatLeaf<(i32 4)>;
+
+def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [AX, CX, DX, BX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<1, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [EAX, ECX, EDX, EBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+ [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<1, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [AL, CL, DL, BL, SPL, BPL, SIL, DIL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [RAX, RCX, RDX, RBX],
+ [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [AX, CX, DX, BX, SP, BP, SI, DI,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+ R8, R9, R10, R11, R12, R13, R14, R15],
+ [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
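+// (In the 64-bit allocation orders below this shows up as the tail
+// ..., BL, R14B, R15B, R12B, R13B.)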
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SPL or BPL.
+ static const unsigned X86_GR8_AO_64_fp[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B
+ };
+ // If not, just don't allocate SPL.
+ static const unsigned X86_GR8_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL
+ };
+ // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ static const unsigned X86_GR8_AO_32[] = {
+ X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
+ };
+
+ GR8Class::iterator
+ GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32;
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp;
+ else
+ return X86_GR8_AO_64;
+ }
+
+ GR8Class::iterator
+ GR8Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+ else
+ return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+ }
+ }];
+}
+
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+ let SubRegClassList = [GR8, GR8];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SP or BP.
+ static const unsigned X86_GR16_AO_64_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W
+ };
+ static const unsigned X86_GR16_AO_32_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
+ };
+ // If not, just don't allocate SP.
+ static const unsigned X86_GR16_AO_64[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP
+ };
+ static const unsigned X86_GR16_AO_32[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
+ };
+
+ GR16Class::iterator
+ GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp;
+ else
+ return X86_GR16_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp;
+ else
+ return X86_GR16_AO_32;
+ }
+ }
+
+ GR16Class::iterator
+ GR16Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+ let SubRegClassList = [GR8, GR8, GR16];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_AO_64_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D
+ };
+ static const unsigned X86_GR32_AO_32_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
+ };
+ // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_AO_64[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+ };
+ static const unsigned X86_GR32_AO_32[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
+ };
+
+ GR32Class::iterator
+ GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp;
+ else
+ return X86_GR32_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp;
+ else
+ return X86_GR32_AO_32;
+ }
+ }
+
+ GR32Class::iterator
+ GR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR64 : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP]> {
+ let SubRegClassList = [GR8, GR8, GR16, GR32];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64Class::iterator
+ GR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return begin(); // None of these are allocatable in 32-bit.
+ if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+ return end()-2; // If so, don't allocate RSP or RBP
+ else
+ return end()-1; // If not, just don't allocate RSP
+ }
+ }];
+}
+
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
+}
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]> {
+}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H];
+}
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD];
+}
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+ let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD];
+}
+
+// GR8_NOREX, GR16_NOREX, GR32_NOREX, GR64_NOREX - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain only the first 8 GPRs.
+// On x86-64, GR64_NOREX, GR32_NOREX and GR16_NOREX are the classes
+// of registers which do not by themselves require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, BL, AH, CH, DH, BH,
+ SIL, DIL, BPL, SPL]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SPL or BPL.
+ static const unsigned X86_GR8_NOREX_AO_64_fp[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL
+ };
+ // If not, just don't allocate SPL.
+ static const unsigned X86_GR8_NOREX_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, X86::BL, X86::BPL
+ };
+ // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ static const unsigned X86_GR8_NOREX_AO_32[] = {
+ X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH
+ };
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_NOREX_AO_32;
+ else if (RI->hasFP(MF))
+ return X86_GR8_NOREX_AO_64_fp;
+ else
+ return X86_GR8_NOREX_AO_64;
+ }
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_NOREX_AO_32 +
+ (sizeof(X86_GR8_NOREX_AO_32) / sizeof(unsigned));
+ else if (RI->hasFP(MF))
+ return X86_GR8_NOREX_AO_64_fp +
+ (sizeof(X86_GR8_NOREX_AO_64_fp) / sizeof(unsigned));
+ else
+ return X86_GR8_NOREX_AO_64 +
+ (sizeof(X86_GR8_NOREX_AO_64) / sizeof(unsigned));
+ }
+ }];
+}
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SP or BP.
+ static const unsigned X86_GR16_AO_fp[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX
+ };
+ // If not, just don't allocate SP.
+ static const unsigned X86_GR16_AO[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP
+ };
+
+ GR16_NOREXClass::iterator
+ GR16_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_fp;
+ else
+ return X86_GR16_AO;
+ }
+
+ GR16_NOREXClass::iterator
+ GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_fp+(sizeof(X86_GR16_AO_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO + (sizeof(X86_GR16_AO) / sizeof(unsigned));
+ }
+ }];
+}
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_NOREX_AO_fp[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
+ };
+ // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_NOREX_AO[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
+ };
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp;
+ else
+ return X86_GR32_NOREX_AO;
+ }
+
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR32_NOREX_AO_fp +
+ (sizeof(X86_GR32_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR32_NOREX_AO +
+ (sizeof(X86_GR32_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
+}
+
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP]> {
+ let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate RSP or RBP.
+ static const unsigned X86_GR64_NOREX_AO_fp[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX
+ };
+ // If not, just don't allocate RSP.
+ static const unsigned X86_GR64_NOREX_AO[] = {
+ X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX, X86::RBP
+ };
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp;
+ else
+ return X86_GR64_NOREX_AO;
+ }
+
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF))
+ return X86_GR64_NOREX_AO_fp +
+ (sizeof(X86_GR64_NOREX_AO_fp) / sizeof(unsigned));
+ else
+ return X86_GR64_NOREX_AO +
+ (sizeof(X86_GR64_NOREX_AO) / sizeof(unsigned));
+ }
+ }];
+}
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GRAD : RegisterClass<"X86", [i32], 32, [EAX, EDX]>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR32Class::iterator
+ FR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR64Class::iterator
+ FR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32,
+ [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ RSTClass::iterator
+ RSTClass::allocation_order_end(const MachineFunction &MF) const {
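+ // Returning begin() leaves the allocation order empty, so the register
+ // allocator never assigns ST registers directly; the FP stackifier maps
+ // FPn values onto them afterwards.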
+ return begin();
+ }
+ }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+ [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ VR128Class::iterator
+ VR128Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..b225f48
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,42 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace X86 {
+ /// RelocationType - An enum for the x86 relocation codes. Note that
+ /// the terminology here doesn't follow x86 convention - word means
+ /// 32-bit and dword means 64-bit.
+ enum RelocationType {
+ // reloc_pcrel_word - PC relative relocation, add the relocated value to
+ // the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ // reloc_picrel_word - PIC base relative relocation, add the relocated
+ // value to the value already in memory, after we adjust it for where the
+ // PIC base is.
+ reloc_picrel_word = 1,
+
+ // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+ // add the relocated value to the value already in memory.
+ reloc_absolute_word = 2,
+ reloc_absolute_dword = 3
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..03ce1ae
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,446 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#if defined(_MSC_VER)
+ #include <intrin.h>
+#endif
+
+static cl::opt<X86Subtarget::AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(
+ clEnumValN(X86Subtarget::ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(X86Subtarget::Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
+
+
+/// True if accessing the GV requires an extra load. For Windows, dllimported
+/// symbols are indirect, loading the value at address GV rather than the
+/// value of GV itself. This means that the GlobalAddress must be in the base
+/// or index register of the address, not the GV offset field.
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ // FIXME: PIC
+ if (TM.getRelocationModel() != Reloc::Static &&
+ TM.getCodeModel() != CodeModel::Large) {
+ if (isTargetDarwin()) {
+ if (isDirectCall)
+ return false;
+ bool isDecl = GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode();
+ if (GV->hasHiddenVisibility() &&
+ (Is64Bit || (!isDecl && !GV->hasCommonLinkage())))
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ return false;
+ return !isDirectCall && (isDecl || GV->isWeakForLinker());
+ } else if (isTargetELF()) {
+ // An extra load is needed for all externally visible symbols.
+ if (isDirectCall)
+ return false;
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return false;
+ return true;
+ } else if (isTargetCygMing() || isTargetWindows()) {
+ return (GV->hasDLLImportLinkage());
+ }
+ }
+ return false;
+}
+
+/// True if accessing the GV requires a register. This is a superset of the
+/// cases where GVRequiresExtraLoad is true. Some variations of PIC require
+/// a register, but not an extra load.
+bool X86Subtarget::GVRequiresRegister(const GlobalValue *GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ if (GVRequiresExtraLoad(GV, TM, isDirectCall))
+ return true;
+ // Code below here need only consider cases where GVRequiresExtraLoad
+ // returns false.
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return !isDirectCall &&
+ (GV->hasLocalLinkage() || GV->hasExternalLinkage());
+ return false;
+}
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getDarwinVers() >= 10)
+ return "__bzero";
+
+ return 0;
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+ if (Is64Bit)
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+/// getSpecialAddressLatency - For targets where it is beneficial to
+/// backschedule instructions that compute addresses, return a value
+/// indicating the number of scheduling cycles of backscheduling that
+/// should be attempted.
+unsigned X86Subtarget::getSpecialAddressLatency() const {
+ // For x86 out-of-order targets, back-schedule address computations so
+ // that loads and stores aren't blocked.
+ // This value was chosen arbitrarily.
+ return 200;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64)
+ #if defined(__GNUC__)
+ // GCC doesn't know that cpuid clobbers ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ int registers[4];
+ __cpuid(registers, value);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ #if defined(__GNUC__)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+ #endif
+#endif
+ return true;
+}
+
+static void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model) {
+ Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ if (Family == 6 || Family == 0xf) {
+ if (Family == 0xf)
+ // Examine extended family ID if family ID is F.
+ Family += (EAX >> 20) & 0xff; // Bits 20 - 27
+ // Examine extended model ID if family ID is 6 or F.
+ Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+ }
+}
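+
+// Worked example (illustrative): EAX = 0x00000F43 has base family 0xf and base
+// model 4; the extended family and model fields are zero, so Family stays 15
+// and Model stays 4, which GetCurrentX86CPU below maps to "nocona" or
+// "prescott" depending on the 64-bit capability bit.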
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+ return;
+
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
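+ // CPUID leaf 1 feature bits tested below: EDX[23] = MMX, EDX[25] = SSE,
+ // EDX[26] = SSE2, ECX[0] = SSE3, ECX[9] = SSSE3, ECX[19] = SSE4.1,
+ // ECX[20] = SSE4.2.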
+ if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
+ if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
+ if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
+ if (ECX & 0x1) X86SSELevel = SSE3;
+ if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
+ if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
+ if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
+
+ bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+ bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+ if (IsIntel || IsAMD) {
+ // Determine if bit test memory instructions are slow.
+ unsigned Family = 0;
+ unsigned Model = 0;
+ DetectFamilyModel(EAX, Family, Model);
+ IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ HasX86_64 = (EDX >> 29) & 0x1;
+ HasSSE4A = IsAMD && ((ECX >> 6) & 0x1);
+ }
+}
+
+static const char *GetCurrentX86CPU() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ return "generic";
+ unsigned Family = 0;
+ unsigned Model = 0;
+ DetectFamilyModel(EAX, Family, Model);
+
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ bool Em64T = (EDX >> 29) & 0x1;
+ bool HasSSE3 = (ECX & 0x1);
+
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ switch (Family) {
+ case 3:
+ return "i386";
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 4: return "pentium-mmx";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 1: return "pentiumpro";
+ case 3:
+ case 5:
+ case 6: return "pentium2";
+ case 7:
+ case 8:
+ case 10:
+ case 11: return "pentium3";
+ case 9:
+ case 13: return "pentium-m";
+ case 14: return "yonah";
+ case 15:
+ case 22: // Celeron M 540
+ return "core2";
+ case 23: // 45nm: Penryn , Wolfdale, Yorkfield (XE)
+ return "penryn";
+ default: return "i686";
+ }
+ case 15: {
+ switch (Model) {
+ case 3:
+ case 4:
+ case 6: // same as 4, but 65nm
+ return (Em64T) ? "nocona" : "prescott";
+ case 26:
+ return "corei7";
+ case 28:
+ return "atom";
+ default:
+ return (Em64T) ? "x86-64" : "pentium4";
+ }
+ }
+
+ default:
+ return "generic";
+ }
+ } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+ // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
+ // appears to be no way to generate the wide variety of AMD-specific targets
+ // from the information returned from CPUID.
+ switch (Family) {
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 6:
+ case 7: return "k6";
+ case 8: return "k6-2";
+ case 9:
+ case 13: return "k6-3";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 4: return "athlon-tbird";
+ case 6:
+ case 7:
+ case 8: return "athlon-mp";
+ case 10: return "athlon-xp";
+ default: return "athlon";
+ }
+ case 15:
+ if (HasSSE3) {
+ switch (Model) {
+ default: return "k8-sse3";
+ }
+ } else {
+ switch (Model) {
+ case 1: return "opteron";
+ case 5: return "athlon-fx"; // also opteron
+ default: return "athlon64";
+ }
+ }
+ case 16:
+ switch (Model) {
+ default: return "amdfam10";
+ }
+ default:
+ return "generic";
+ }
+ } else {
+ return "generic";
+ }
+}
+
+X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
+ : AsmFlavor(AsmWriterFlavor)
+ , PICStyle(PICStyles::None)
+ , X86SSELevel(NoMMXSSE)
+ , X863DNowLevel(NoThreeDNow)
+ , HasX86_64(false)
+ , IsBTMemSlow(false)
+ , DarwinVers(0)
+ , IsLinux(false)
+ , stackAlignment(8)
+ // FIXME: this is a known good value for Yonah. How about others?
+ , MaxInlineSizeThreshold(128)
+ , Is64Bit(is64Bit)
+ , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+ // Determine default and user specified characteristics
+ if (!FS.empty()) {
+ // If feature string is not empty, parse features string.
+ std::string CPU = GetCurrentX86CPU();
+ ParseSubtargetFeatures(FS, CPU);
+ // All X86-64 CPUs also have SSE2; however, the user might request no SSE via
+ // -mattr, so don't force SSELevel here.
+ } else {
+ // Otherwise, use CPUID to auto-detect feature set.
+ AutoDetectSubtargetFeatures();
+ // Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
+ if (Is64Bit && X86SSELevel < SSE2)
+ X86SSELevel = SSE2;
+ }
+
+ // If requesting codegen for X86-64, make sure that 64-bit features
+ // are enabled.
+ if (Is64Bit)
+ HasX86_64 = true;
+
+ DOUT << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n";
+ assert((!Is64Bit || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+ // Set the boolean corresponding to the current target triple, or the default
+ // if one cannot be determined, to true.
+ const std::string& TT = M.getTargetTriple();
+ if (TT.length() > 5) {
+ size_t Pos;
+ if ((Pos = TT.find("-darwin")) != std::string::npos) {
+ TargetType = isDarwin;
+
+ // Compute the darwin version number.
+ if (isdigit(TT[Pos+7]))
+ DarwinVers = atoi(&TT[Pos+7]);
+ else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+ } else if (TT.find("linux") != std::string::npos) {
+ // Linux doesn't imply ELF, but we don't currently support anything else.
+ TargetType = isELF;
+ IsLinux = true;
+ } else if (TT.find("cygwin") != std::string::npos) {
+ TargetType = isCygwin;
+ } else if (TT.find("mingw") != std::string::npos) {
+ TargetType = isMingw;
+ } else if (TT.find("win32") != std::string::npos) {
+ TargetType = isWindows;
+ } else if (TT.find("windows") != std::string::npos) {
+ TargetType = isWindows;
+ } else if (TT.find("-cl") != std::string::npos) {
+ TargetType = isDarwin;
+ DarwinVers = 9;
+ }
+ } else if (TT.empty()) {
+#if defined(__CYGWIN__)
+ TargetType = isCygwin;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+ TargetType = isMingw;
+#elif defined(__APPLE__)
+ TargetType = isDarwin;
+#if __APPLE_CC__ > 5400
+ DarwinVers = 9; // GCC 5400+ is Leopard.
+#else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+#endif
+
+#elif defined(_WIN32) || defined(_WIN64)
+ TargetType = isWindows;
+#elif defined(__linux__)
+ // Linux doesn't imply ELF, but we don't currently support anything else.
+ TargetType = isELF;
+ IsLinux = true;
+#endif
+ }
+
+ // If the asm syntax hasn't been overridden on the command line, use whatever
+ // the target wants.
+ if (AsmFlavor == X86Subtarget::Unset) {
+ AsmFlavor = (TargetType == isWindows)
+ ? X86Subtarget::Intel : X86Subtarget::ATT;
+ }
+
+ // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
+ // bit targets.
+ if (TargetType == isDarwin || Is64Bit)
+ stackAlignment = 16;
+
+ if (StackAlignment)
+ stackAlignment = StackAlignment;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..46476f2
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,224 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class Module;
+class GlobalValue;
+class TargetMachine;
+
+namespace PICStyles {
+enum Style {
+ Stub, GOT, RIPRel, WinPIC, None
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+public:
+ enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1, Unset
+ };
+protected:
+ enum X86SSEEnum {
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, ThreeDNow, ThreeDNowA
+ };
+
+ /// AsmFlavor - Which x86 asm dialect to use.
+ ///
+ AsmWriterFlavorTy AsmFlavor;
+
+ /// PICStyle - Which PIC style to use
+ ///
+ PICStyles::Style PICStyle;
+
+ /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+ /// none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+ ///
+ X863DNowEnum X863DNowLevel;
+
+ /// HasX86_64 - True if the processor supports X86-64 instructions.
+ ///
+ bool HasX86_64;
+
+ /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// HasSSE4A - True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// DarwinVers - Nonzero if this is a darwin platform: the numeric
+ /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
+ unsigned char DarwinVers; // Set for any darwin-x86 platform.
+
+ /// isLinux - true if this is a "linux" platform.
+ bool IsLinux;
+
+ /// stackAlignment - The minimum alignment known to hold for the stack frame on
+ /// entry to the function, which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+private:
+ /// Is64Bit - True if the processor supports 64-bit instructions and the
+ /// module pointer size is 64 bits.
+ bool Is64Bit;
+
+public:
+ enum {
+ isELF, isCygwin, isDarwin, isWindows, isMingw
+ } TargetType;
+
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+
+ /// getStackAlignment - Returns the minimum alignment known to hold for the
+ /// stack frame on entry to the function, which must be maintained by every
+ /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+ /// instruction.
+ void AutoDetectSubtargetFeatures();
+
+ bool is64Bit() const { return Is64Bit; }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasMMX() const { return X86SSELevel >= MMX; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+
+ unsigned getAsmFlavor() const {
+ return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+ }
+
+ bool isFlavorAtt() const { return AsmFlavor == ATT; }
+ bool isFlavorIntel() const { return AsmFlavor == Intel; }
+
+ bool isTargetDarwin() const { return TargetType == isDarwin; }
+ bool isTargetELF() const {
+ return TargetType == isELF;
+ }
+ bool isTargetWindows() const { return TargetType == isWindows; }
+ bool isTargetMingw() const { return TargetType == isMingw; }
+ bool isTargetCygMing() const { return (TargetType == isMingw ||
+ TargetType == isCygwin); }
+ bool isTargetCygwin() const { return TargetType == isCygwin; }
+ bool isTargetWin64() const {
+ return (Is64Bit && (TargetType == isMingw || TargetType == isWindows));
+ }
+
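+ // The data layout strings below use the LLVM data layout syntax: "e" means
+ // little-endian, "p:64:64" means 64-bit pointers with 64-bit alignment,
+ // "i64:32:64" gives i64 a 32-bit ABI and 64-bit preferred alignment, and
+ // "f80:128:128" aligns the x87 long double to 128 bits.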
+ std::string getDataLayout() const {
+ const char *p;
+ if (is64Bit())
+ p = "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128";
+ else {
+ if (isTargetDarwin())
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128";
+ else
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32";
+ }
+ return std::string(p);
+ }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleStub() const { return PICStyle == PICStyles::Stub; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+ bool isPICStyleWinPIC() const { return PICStyle == PICStyles::WinPIC; }
+
+ /// getDarwinVers - Return the Darwin version number (8 = Tiger, 9 = Leopard).
+ unsigned getDarwinVers() const { return DarwinVers; }
+
+ /// isLinux - Return true if the target is "Linux".
+ bool isLinux() const { return IsLinux; }
+
+ /// True if accessing the GV requires an extra load. For Windows, dllimported
+ /// symbols are indirect, loading the value at address GV rather than the
+ /// value of GV itself. This means that the GlobalAddress must be in the base
+ /// or index register of the address, not the GV offset field.
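+ /// For example, a dllimported symbol foo is normally reached through its
+ /// import-table slot (commonly spelled __imp_foo), so code must first load
+ /// the pointer stored in that slot and then dereference it.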
+ bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+
+ /// True if accessing the GV requires a register. This is a superset of the
+ /// cases where GVRequiresExtraLoad is true. Some variations of PIC require
+ /// a register, but not an extra load.
+ bool GVRequiresRegister(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+
+ /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+ /// to immediate address.
+ bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+ /// getBZeroEntry - Returns the name of a function with an interface like
+ /// the non-standard bzero function, if such a function exists on the
+ /// current subtarget and is considered preferable to memset with zero
+ /// passed as the second argument. Otherwise it returns null.
+ const char *getBZeroEntry() const;
+
+ /// getSpecialAddressLatency - For targets where it is beneficial to
+ /// backschedule instructions that compute addresses, return a value
+ /// indicating the number of scheduling cycles of backscheduling that
+ /// should be attempted.
+ unsigned getSpecialAddressLatency() const;
+};
+
+namespace X86 {
+ /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+ /// the specified arguments. If we can't run cpuid on the host, return true.
+ bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp
new file mode 100644
index 0000000..5dda5f4
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.cpp
@@ -0,0 +1,461 @@
+//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the X86TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Dwarf.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
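+// Translation table (installed as AsmTransCBE in X86TargetAsmInfo.h) mapping
+// GCC-style inline-asm register and clobber names to LLVM constraint codes;
+// for example a "{di}" clobber becomes the single-letter constraint "D", and
+// entries that map to "" (such as "{dirflag}" and "{fpsr}") are dropped.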
+const char *const llvm::x86_asm_table[] = {
+ "{si}", "S",
+ "{di}", "D",
+ "{ax}", "a",
+ "{cx}", "c",
+ "{memory}", "memory",
+ "{flags}", "",
+ "{dirflag}", "",
+ "{fpsr}", "",
+ "{cc}", "cc",
+ 0,0};
+
+X86DarwinTargetAsmInfo::X86DarwinTargetAsmInfo(const X86TargetMachine &TM):
+ X86TargetAsmInfo<DarwinTargetAsmInfo>(TM) {
+ const X86Subtarget* Subtarget = &TM.getSubtarget<X86Subtarget>();
+ bool is64Bit = Subtarget->is64Bit();
+
+ AlignmentIsInBytes = false;
+ TextAlignFillValue = 0x90;
+ GlobalPrefix = "_";
+ if (!is64Bit)
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+ ZeroDirective = "\t.space\t"; // ".space N" emits N zeros.
+ PrivateGlobalPrefix = "L"; // Marker for constant pool idxs
+ LessPrivateGlobalPrefix = "l"; // Marker for some ObjC metadata
+ BSSSection = 0; // no BSS section.
+ ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill
+ if (TM.getRelocationModel() != Reloc::Static)
+ ConstantPoolSection = "\t.const_data";
+ else
+ ConstantPoolSection = "\t.const\n";
+ JumpTableDataSection = "\t.const\n";
+ CStringSection = "\t.cstring";
+ // FIXME: Why don't we always use this section?
+ if (is64Bit) {
+ SixteenByteConstantSection = getUnnamedSection("\t.literal16\n",
+ SectionFlags::Mergeable);
+ }
+ LCOMMDirective = "\t.lcomm\t";
+ SwitchToSectionDirective = "\t.section ";
+ StringConstantPrefix = "\1LC";
+ // Leopard and above support aligned common symbols.
+ COMMDirectiveTakesAlignment = (Subtarget->getDarwinVers() >= 9);
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+ NonLocalEHFrameLabel = true;
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorsSection = ".constructor";
+ StaticDtorsSection = ".destructor";
+ } else {
+ StaticCtorsSection = ".mod_init_func";
+ StaticDtorsSection = ".mod_term_func";
+ }
+ if (is64Bit) {
+ PersonalityPrefix = "";
+ PersonalitySuffix = "+4@GOTPCREL";
+ } else {
+ PersonalityPrefix = "L";
+ PersonalitySuffix = "$non_lazy_ptr";
+ }
+ NeedsIndirectEncoding = true;
+ InlineAsmStart = "## InlineAsm Start";
+ InlineAsmEnd = "## InlineAsm End";
+ CommentString = "##";
+ SetDirective = "\t.set";
+ PCSymbol = ".";
+ UsedDirective = "\t.no_dead_strip\t";
+ WeakDefDirective = "\t.weak_definition ";
+ WeakRefDirective = "\t.weak_reference ";
+ HiddenDirective = "\t.private_extern ";
+ ProtectedDirective = "\t.globl\t";
+
+ // In non-PIC modes, emit a special label before jump tables so that the
+ // linker can perform more accurate dead code stripping.
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Emit a local label that is preserved until the linker runs.
+ JumpTableSpecialLabelPrefix = "l";
+ }
+
+ SupportsDebugInformation = true;
+ NeedsSet = true;
+ DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+ DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+ DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+ DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+ DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+ DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+ DwarfDebugInlineSection = ".section __DWARF,__debug_inlined,regular,debug";
+ DwarfUsesInlineInfoSection = true;
+ DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+ DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+ DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+ DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+ DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+
+ // Exception handling
+ SupportsExceptionHandling = true;
+ GlobalEHDirective = "\t.globl\t";
+ SupportsWeakOmittedEHFrame = false;
+ AbsoluteEHSectionOffsets = false;
+ DwarfEHFrameSection =
+ ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support";
+ DwarfExceptionSection = ".section __DATA,__gcc_except_tab";
+}
+
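+// A brief reminder of the DWARF EH encodings chosen below: DW_EH_PE_pcrel
+// makes the value relative to the encoding's own address, DW_EH_PE_indirect
+// means the value is the address of a slot holding the real pointer,
+// DW_EH_PE_sdata4/udata4/sdata8 select signed/unsigned 4- or 8-byte data,
+// and DW_EH_PE_absptr is a plain absolute pointer.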
+unsigned
+X86DarwinTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ if (Reason == DwarfEncoding::Functions && Global)
+ return (DW_EH_PE_pcrel | DW_EH_PE_indirect | DW_EH_PE_sdata4);
+ else if (Reason == DwarfEncoding::CodeLabels || !Global)
+ return DW_EH_PE_pcrel;
+ else
+ return DW_EH_PE_absptr;
+}
+
+const char *
+X86DarwinTargetAsmInfo::getEHGlobalPrefix() const
+{
+ const X86Subtarget* Subtarget = &TM.getSubtarget<X86Subtarget>();
+ if (Subtarget->getDarwinVers() > 9)
+ return PrivateGlobalPrefix;
+ else
+ return "";
+}
+
+X86ELFTargetAsmInfo::X86ELFTargetAsmInfo(const X86TargetMachine &TM):
+ X86TargetAsmInfo<ELFTargetAsmInfo>(TM) {
+
+ CStringSection = ".rodata.str";
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Debug Information
+ AbsoluteDebugSectionOffsets = true;
+ SupportsDebugInformation = true;
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+ // Exception handling
+ SupportsExceptionHandling = true;
+ AbsoluteEHSectionOffsets = false;
+ DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+ DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+
+ // On Linux we must declare when we can use a non-executable stack.
+ if (TM.getSubtarget<X86Subtarget>().isLinux())
+ NonexecutableStackDirective = "\t.section\t.note.GNU-stack,\"\",@progbits";
+}
+
+unsigned
+X86ELFTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ CodeModel::Model CM = TM.getCodeModel();
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ unsigned Format = 0;
+
+ if (!is64Bit)
+ // 32 bit targets always encode pointers as 4 bytes
+ Format = DW_EH_PE_sdata4;
+ else {
+ // 64 bit targets encode pointers in 4 bytes iff:
+ // - code model is small OR
+ // - code model is medium and we're emitting externally visible symbols
+ // or any code symbols
+ if (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && (Global ||
+ Reason != DwarfEncoding::Data)))
+ Format = DW_EH_PE_sdata4;
+ else
+ Format = DW_EH_PE_sdata8;
+ }
+
+ if (Global)
+ Format |= DW_EH_PE_indirect;
+
+ return (Format | DW_EH_PE_pcrel);
+ } else {
+ if (is64Bit &&
+ (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && Reason != DwarfEncoding::Data)))
+ return DW_EH_PE_udata4;
+ else
+ return DW_EH_PE_absptr;
+ }
+}
+
+X86COFFTargetAsmInfo::X86COFFTargetAsmInfo(const X86TargetMachine &TM):
+ X86GenericTargetAsmInfo(TM) {
+
+ GlobalPrefix = "_";
+ LCOMMDirective = "\t.lcomm\t";
+ COMMDirectiveTakesAlignment = false;
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+ StaticCtorsSection = "\t.section .ctors,\"aw\"";
+ StaticDtorsSection = "\t.section .dtors,\"aw\"";
+ HiddenDirective = NULL;
+ PrivateGlobalPrefix = "L"; // Prefix for private global symbols
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+ AbsoluteDebugSectionOffsets = true;
+ AbsoluteEHSectionOffsets = false;
+ SupportsDebugInformation = true;
+ DwarfSectionOffsetDirective = "\t.secrel32\t";
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"dr\"";
+ DwarfInfoSection = "\t.section\t.debug_info,\"dr\"";
+ DwarfLineSection = "\t.section\t.debug_line,\"dr\"";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"dr\"";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\"";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\"";
+ DwarfStrSection = "\t.section\t.debug_str,\"dr\"";
+ DwarfLocSection = "\t.section\t.debug_loc,\"dr\"";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\"";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"dr\"";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\"";
+}
+
+unsigned
+X86COFFTargetAsmInfo::PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const {
+ CodeModel::Model CM = TM.getCodeModel();
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ unsigned Format = 0;
+
+ if (!is64Bit)
+ // 32 bit targets always encode pointers as 4 bytes
+ Format = DW_EH_PE_sdata4;
+ else {
+ // 64 bit targets encode pointers in 4 bytes iff:
+ // - code model is small OR
+ // - code model is medium and we're emitting externally visible symbols
+ // or any code symbols
+ if (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && (Global ||
+ Reason != DwarfEncoding::Data)))
+ Format = DW_EH_PE_sdata4;
+ else
+ Format = DW_EH_PE_sdata8;
+ }
+
+ if (Global)
+ Format |= DW_EH_PE_indirect;
+
+ return (Format | DW_EH_PE_pcrel);
+ } else {
+ if (is64Bit &&
+ (CM == CodeModel::Small ||
+ (CM == CodeModel::Medium && Reason != DwarfEncoding::Data)))
+ return DW_EH_PE_udata4;
+ else
+ return DW_EH_PE_absptr;
+ }
+}
+
+std::string
+X86COFFTargetAsmInfo::UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind kind) const {
+ switch (kind) {
+ case SectionKind::Text:
+ return ".text$linkonce" + GV->getName();
+ case SectionKind::Data:
+ case SectionKind::BSS:
+ case SectionKind::ThreadData:
+ case SectionKind::ThreadBSS:
+ return ".data$linkonce" + GV->getName();
+ case SectionKind::ROData:
+ case SectionKind::RODataMergeConst:
+ case SectionKind::RODataMergeStr:
+ return ".rdata$linkonce" + GV->getName();
+ default:
+ assert(0 && "Unknown section kind");
+ }
+ return NULL;
+}
+
+std::string X86COFFTargetAsmInfo::printSectionFlags(unsigned flags) const {
+ std::string Flags = ",\"";
+
+ if (flags & SectionFlags::Code)
+ Flags += 'x';
+ if (flags & SectionFlags::Writeable)
+ Flags += 'w';
+
+ Flags += "\"";
+
+ return Flags;
+}
+
+X86WinTargetAsmInfo::X86WinTargetAsmInfo(const X86TargetMachine &TM):
+ X86GenericTargetAsmInfo(TM) {
+ GlobalPrefix = "_";
+ CommentString = ";";
+
+ PrivateGlobalPrefix = "$";
+ AlignDirective = "\talign\t";
+ ZeroDirective = "\tdb\t";
+ ZeroDirectiveSuffix = " dup(0)";
+ AsciiDirective = "\tdb\t";
+ AscizDirective = 0;
+ Data8bitsDirective = "\tdb\t";
+ Data16bitsDirective = "\tdw\t";
+ Data32bitsDirective = "\tdd\t";
+ Data64bitsDirective = "\tdq\t";
+ HasDotTypeDotSizeDirective = false;
+ HasSingleParameterDotFile = false;
+
+ TextSection = getUnnamedSection("_text", SectionFlags::Code);
+ DataSection = getUnnamedSection("_data", SectionFlags::Writeable);
+
+ JumpTableDataSection = NULL;
+ SwitchToSectionDirective = "";
+ TextSectionStartSuffix = "\tsegment 'CODE'";
+ DataSectionStartSuffix = "\tsegment 'DATA'";
+ SectionEndDirectiveSuffix = "\tends\n";
+}
+
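+// LowerToBSwap - Replace a matched inline-asm byte swap with a call to the
+// llvm.bswap intrinsic on the same value. Roughly, source such as
+//   asm("bswap %0" : "=r"(x) : "0"(x));
+// becomes "%x.new = call i32 @llvm.bswap.i32(i32 %x)", with all uses of the
+// original asm call rewritten to use the intrinsic's result.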
+template <class BaseTAI>
+bool X86TargetAsmInfo<BaseTAI>::LowerToBSwap(CallInst *CI) const {
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical ops
+ // instead of emitting the bswap asm. For now, we don't support 486 or lower,
+ // so don't worry about this.
+
+ // Verify this is a simple bswap.
+ if (CI->getNumOperands() != 2 ||
+ CI->getType() != CI->getOperand(1)->getType() ||
+ !CI->getType()->isInteger())
+ return false;
+
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ const Type *Tys[] = { Ty };
+ Module *M = CI->getParent()->getParent()->getParent();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+
+ Value *Op = CI->getOperand(1);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
+
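+// ExpandInlineAsm - Pattern-match a few well-known byte-swap idioms written
+// as inline asm and hand them to LowerToBSwap. The recognized forms are,
+// roughly: a lone "bswap $0" / "bswapl $0" / "bswapq $0", the 16-bit
+// "rorw $$8, ${0:w}" rotate, and the classic three-instruction i64 sequence
+// "bswap %eax; bswap %edx; xchgl %eax, %edx" tied to the edx:eax ("A")
+// constraint.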
+template <class BaseTAI>
+bool X86TargetAsmInfo<BaseTAI>::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+
+ std::string AsmStr = IA->getAsmString();
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ std::vector<std::string> AsmPieces;
+ SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
+
+ // bswap $0
+ if (AsmPieces.size() == 2 &&
+ (AsmPieces[0] == "bswap" ||
+ AsmPieces[0] == "bswapq" ||
+ AsmPieces[0] == "bswapl") &&
+ (AsmPieces[1] == "$0" ||
+ AsmPieces[1] == "${0:q}")) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return LowerToBSwap(CI);
+ }
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType() == Type::Int16Ty &&
+ AsmPieces.size() == 3 &&
+ AsmPieces[0] == "rorw" &&
+ AsmPieces[1] == "$$8," &&
+ AsmPieces[2] == "${0:w}" &&
+ IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
+ return LowerToBSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ std::vector<std::string> Words;
+ SplitString(AsmPieces[0], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+ Words.clear();
+ SplitString(AsmPieces[1], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+ Words.clear();
+ SplitString(AsmPieces[2], Words, " \t,");
+ if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+ Words[2] == "%edx") {
+ return LowerToBSwap(CI);
+ }
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+// Instantiate default implementation.
+TEMPLATE_INSTANTIATION(class X86TargetAsmInfo<TargetAsmInfo>);
diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h
new file mode 100644
index 0000000..f89171d
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.h
@@ -0,0 +1,75 @@
+//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "X86TargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/ELFTargetAsmInfo.h"
+#include "llvm/Target/DarwinTargetAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+ extern const char *const x86_asm_table[];
+
+ template <class BaseTAI>
+ struct X86TargetAsmInfo : public BaseTAI {
+ explicit X86TargetAsmInfo(const X86TargetMachine &TM):
+ BaseTAI(TM) {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+
+ BaseTAI::AsmTransCBE = x86_asm_table;
+ BaseTAI::AssemblerDialect = Subtarget->getAsmFlavor();
+ }
+
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+ private:
+ bool LowerToBSwap(CallInst *CI) const;
+ };
+
+ typedef X86TargetAsmInfo<TargetAsmInfo> X86GenericTargetAsmInfo;
+
+ EXTERN_TEMPLATE_INSTANTIATION(class X86TargetAsmInfo<TargetAsmInfo>);
+
+ struct X86DarwinTargetAsmInfo : public X86TargetAsmInfo<DarwinTargetAsmInfo> {
+ explicit X86DarwinTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ virtual const char *getEHGlobalPrefix() const;
+ };
+
+ struct X86ELFTargetAsmInfo : public X86TargetAsmInfo<ELFTargetAsmInfo> {
+ explicit X86ELFTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ };
+
+ struct X86COFFTargetAsmInfo : public X86GenericTargetAsmInfo {
+ explicit X86COFFTargetAsmInfo(const X86TargetMachine &TM);
+ virtual unsigned PreferredEHDataFormat(DwarfEncoding::Target Reason,
+ bool Global) const;
+ virtual std::string UniqueSectionForGlobal(const GlobalValue* GV,
+ SectionKind::Kind kind) const;
+ virtual std::string printSectionFlags(unsigned flags) const;
+ };
+
+ struct X86WinTargetAsmInfo : public X86GenericTargetAsmInfo {
+ explicit X86WinTargetAsmInfo(const X86TargetMachine &TM);
+ };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..8264462
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,317 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// X86TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int X86TargetMachineModule;
+int X86TargetMachineModule = 0;
+
+// Register the target.
+static RegisterTarget<X86_32TargetMachine>
+X("x86", "32-bit X86: Pentium-Pro and above");
+static RegisterTarget<X86_64TargetMachine>
+Y("x86-64", "64-bit X86: EM64T and AMD64");
+
+// No assembler printer by default
+X86TargetMachine::AsmPrinterCtorFn X86TargetMachine::AsmPrinterCtor = 0;
+
+const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
+ if (Subtarget.isFlavorIntel())
+ return new X86WinTargetAsmInfo(*this);
+ else
+ switch (Subtarget.TargetType) {
+ case X86Subtarget::isDarwin:
+ return new X86DarwinTargetAsmInfo(*this);
+ case X86Subtarget::isELF:
+ return new X86ELFTargetAsmInfo(*this);
+ case X86Subtarget::isMingw:
+ case X86Subtarget::isCygwin:
+ return new X86COFFTargetAsmInfo(*this);
+ case X86Subtarget::isWindows:
+ return new X86WinTargetAsmInfo(*this);
+ default:
+ return new X86GenericTargetAsmInfo(*this);
+ }
+}
+
+unsigned X86_32TargetMachine::getJITMatchQuality() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_64TargetMachine::getJITMatchQuality() {
+#if defined(__x86_64__) || defined(_M_AMD64)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "i[3-9]86-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+ TT[4] == '-' && TT[1] - '3' < 6)
+ return 20;
+ // If the target triple is something non-X86, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "x86_64-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
+ TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
+ return 20;
+
+ // We strongly match "amd64-*".
+ if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
+ TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
+ return 20;
+
+ // If the target triple is something non-X86-64, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer64)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target machine, using either the
+/// 32-bit (ILP32) or 64-bit model as selected by is64Bit.
+///
+X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
+ bool is64Bit)
+ : Subtarget(M, FS, is64Bit),
+ DataLayout(Subtarget.getDataLayout()),
+ FrameInfo(TargetFrameInfo::StackGrowsDown,
+ Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+ InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
+ DefRelocModel = getRelocationModel();
+ // FIXME: Correctly select PIC model for Win64 stuff
+ if (getRelocationModel() == Reloc::Default) {
+ if (Subtarget.isTargetDarwin() ||
+ (Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64()))
+ setRelocationModel(Reloc::DynamicNoPIC);
+ else
+ setRelocationModel(Reloc::Static);
+ }
+
+ // ELF doesn't have a distinct dynamic-no-PIC model. Dynamic-no-PIC
+ // is defined as a model for code which may be used in static or
+ // dynamic executables but not necessarily a shared library. On ELF
+ // implement this by using the Static model.
+ if (Subtarget.isTargetELF() &&
+ getRelocationModel() == Reloc::DynamicNoPIC)
+ setRelocationModel(Reloc::Static);
+
+ if (Subtarget.is64Bit()) {
+ // No DynamicNoPIC support under X86-64.
+ if (getRelocationModel() == Reloc::DynamicNoPIC)
+ setRelocationModel(Reloc::PIC_);
+ // Default X86-64 code model is small.
+ if (getCodeModel() == CodeModel::Default)
+ setCodeModel(CodeModel::Small);
+ }
+
+ if (Subtarget.isTargetCygMing())
+ Subtarget.setPICStyle(PICStyles::WinPIC);
+ else if (Subtarget.isTargetDarwin()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyles::Stub);
+ } else if (Subtarget.isTargetELF()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyles::GOT);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createX86ISelDag(*this, OptLevel));
+
+ // If we're using Fast-ISel, clean up the mess.
+ if (EnableFastISel)
+ PM.add(createDeadMachineInstructionElimPass());
+
+ // Install a pass to insert x87 FP_REG_KILL instructions, as needed.
+ PM.add(createX87FPRegKillInserterPass());
+
+ return false;
+}
+
+bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Calculate and set max stack object alignment early, so we can decide
+ // whether we will need stack realignment (and thus FP).
+ PM.add(createX86MaxStackAlignmentCalculatorPass());
+ return false; // -print-machineinstr shouldn't print after this.
+}
+
+bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createX86FloatingPointStackifierPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose,
+ raw_ostream &Out) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(Out, *this, OptLevel, Verbose));
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ // FIXME: Move this to TargetJITInfo!
+ // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+ if (DefRelocModel == Reloc::Default &&
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ setRelocationModel(Reloc::Static);
+
+ // 64-bit JIT places everything in the same buffer except external functions.
+ // On Darwin, use small code model but hack the call instruction for
+ // externals. Elsewhere, do not assume globals are in the lower 4G.
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTargetDarwin())
+ setCodeModel(CodeModel::Small);
+ else
+ setCodeModel(CodeModel::Large);
+ }
+
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ // FIXME: Move this to TargetJITInfo!
+ // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+ if (DefRelocModel == Reloc::Default &&
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit()))
+ setRelocationModel(Reloc::Static);
+
+ // 64-bit JIT places everything in the same buffer except external functions.
+ // On Darwin, use small code model but hack the call instruction for
+ // externals. Elsewhere, do not assume globals are in the lower 4G.
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTargetDarwin())
+ setCodeModel(CodeModel::Small);
+ else
+ setCodeModel(CodeModel::Large);
+ }
+
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ MachineCodeEmitter &MCE) {
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm,
+ JITCodeEmitter &JCE) {
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+ if (DumpAsm) {
+ assert(AsmPrinterCtor && "AsmPrinter was not linked in");
+ if (AsmPrinterCtor)
+ PM.add(AsmPrinterCtor(errs(), *this, OptLevel, true));
+ }
+
+ return false;
+}
+
+/// symbolicAddressesAreRIPRel - Return true if symbolic addresses are
+/// RIP-relative on this machine, taking into consideration the relocation
+/// model and subtarget. RIP-relative addresses cannot have a separate
+/// base or index register.
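+/// For example, under the small code model with PIC on x86-64 a global is
+/// typically addressed as "movq foo@GOTPCREL(%rip), %rax", and no additional
+/// base or index register can be folded into that operand.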
+bool X86TargetMachine::symbolicAddressesAreRIPRel() const {
+ return getRelocationModel() != Reloc::Static &&
+ Subtarget.isPICStyleRIPRel();
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..ecc1d39
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,124 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class X86TargetMachine : public LLVMTargetMachine {
+ X86Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ TargetFrameInfo FrameInfo;
+ X86InstrInfo InstrInfo;
+ X86JITInfo JITInfo;
+ X86TargetLowering TLInfo;
+ X86ELFWriterInfo ELFWriterInfo;
+ Reloc::Model DefRelocModel; // Reloc model before it's overridden.
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+ // To avoid having the target depend on the asmprinter libraries, the
+ // asmprinter sets this function pointer to its constructor at startup time
+ // if it is linked in.
+ typedef FunctionPass *(*AsmPrinterCtorFn)(raw_ostream &o,
+ X86TargetMachine &tm,
+ CodeGenOpt::Level OptLevel,
+ bool verbose);
+ static AsmPrinterCtorFn AsmPrinterCtor;
+
+public:
+ X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+ virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual X86JITInfo *getJITInfo() { return &JITInfo; }
+ virtual const X86Subtarget *getSubtargetImpl() const{ return &Subtarget; }
+ virtual X86TargetLowering *getTargetLowering() const {
+ return const_cast<X86TargetLowering*>(&TLInfo);
+ }
+ virtual const X86RegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+ return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ }
+
+ static unsigned getModuleMatchQuality(const Module &M);
+ static unsigned getJITMatchQuality();
+
+ static void registerAsmPrinter(AsmPrinterCtorFn F) {
+ AsmPrinterCtor = F;
+ }
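+ // The asm printer library calls registerAsmPrinter() at startup when it is
+ // linked in (see the AsmPrinterCtor comment above); addAssemblyEmitter (and
+ // the code emitters when DumpAsm is set) assert if nothing was registered.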
+
+ // Set up the pass pipeline.
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addAssemblyEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool Verbose, raw_ostream &Out);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DumpAsm, JITCodeEmitter &JCE);
+
+ /// symbolicAddressesAreRIPRel - Return true if symbolic addresses are
+ /// RIP-relative on this machine, taking into consideration the relocation
+ /// model and subtarget. RIP-relative addresses cannot have a separate
+ /// base or index register.
+ bool symbolicAddressesAreRIPRel() const;
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+ X86_32TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+ X86_64TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif