path: root/lib/Target/X86
author    rdivacky <rdivacky@FreeBSD.org>    2010-03-03 17:27:15 +0000
committer rdivacky <rdivacky@FreeBSD.org>    2010-03-03 17:27:15 +0000
commit    8230c40430a1325b5cc5bc0221931487b4bd573c (patch)
tree      836a05cff50ca46176117b86029f061fa4db54f0 /lib/Target/X86
parent    f25ddd991a5601d0101602c4c263a58c7af4b8a2 (diff)
Update LLVM to 97654.
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp |   2
-rw-r--r--  lib/Target/X86/CMakeLists.txt                   |   1
-rw-r--r--  lib/Target/X86/README-SSE.txt                   |   8
-rw-r--r--  lib/Target/X86/README.txt                       |   5
-rw-r--r--  lib/Target/X86/X86.h                            |  11
-rw-r--r--  lib/Target/X86/X86AsmBackend.cpp                |  34
-rw-r--r--  lib/Target/X86/X86FastISel.cpp                  |   2
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp              | 383
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp              | 286
-rw-r--r--  lib/Target/X86/X86ISelLowering.h                |   2
-rw-r--r--  lib/Target/X86/X86Instr64bit.td                 |  36
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td               |   2
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp                 |  22
-rw-r--r--  lib/Target/X86/X86InstrInfo.td                  | 173
-rw-r--r--  lib/Target/X86/X86InstrMMX.td                   |  17
-rw-r--r--  lib/Target/X86/X86InstrSSE.td                   | 416
-rw-r--r--  lib/Target/X86/X86MCAsmInfo.cpp                 |  11
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp              |  10
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h                |   3
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td               |  63
-rw-r--r--  lib/Target/X86/X86Subtarget.h                   |  23
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp             |   6
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp          |  64
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h            |  22
24 files changed, 702 insertions(+), 900 deletions(-)
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
index 1a35a49..734a545 100644
--- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
@@ -73,7 +73,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo) {
O << '$' << Op.getImm();
if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
- *CommentStream << format("imm = 0x%X\n", Op.getImm());
+ *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 61f26a7..eed3b45 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv)
tablegen(X86GenSubtarget.inc -gen-subtarget)
set(sources
+ X86AsmBackend.cpp
X86CodeEmitter.cpp
X86COFFMachineModuleInfo.cpp
X86ELFWriterInfo.cpp
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 19eb05e..e5f84e8 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -67,8 +67,8 @@ no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
[ %tmp.34.i18, %no_exit.i7 ]
%tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
[ %tmp.28.i16, %no_exit.i7 ]
- %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
- %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+ %tmp.28.i16 = fadd double %tmp.0.0.0.i10, 0.000000e+00
+ %tmp.34.i18 = fadd double %tmp.0.1.0.i9, 0.000000e+00
br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
Compute_Tree.exit23: ; preds = %no_exit.i7
@@ -97,7 +97,7 @@ pcmp/pand/pandn/por to do a selection instead of a conditional branch:
double %X(double %Y, double %Z, double %A, double %B) {
%C = setlt double %A, %B
- %z = add double %Z, 0.0 ;; select operand is not a load
+ %z = fadd double %Z, 0.0 ;; select operand is not a load
%D = select bool %C, double %Y, double %z
ret double %D
}
@@ -545,7 +545,7 @@ eliminates a constant pool load. For example, consider:
define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
- %tmp6 = sub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
ret i64 %tmp20
}
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 3c6138b..d4545a6 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -227,11 +227,6 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
//===---------------------------------------------------------------------===//
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 1a1e447..ba0ee6c 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -19,13 +19,15 @@
namespace llvm {
-class X86TargetMachine;
class FunctionPass;
-class MachineCodeEmitter;
+class JITCodeEmitter;
+class MCAssembler;
class MCCodeEmitter;
class MCContext;
-class JITCodeEmitter;
+class MachineCodeEmitter;
class Target;
+class TargetAsmBackend;
+class X86TargetMachine;
class formatted_raw_ostream;
/// createX86ISelDag - This pass converts a legalized DAG into a
@@ -55,6 +57,9 @@ MCCodeEmitter *createX86_32MCCodeEmitter(const Target &, TargetMachine &TM,
MCCodeEmitter *createX86_64MCCodeEmitter(const Target &, TargetMachine &TM,
MCContext &Ctx);
+TargetAsmBackend *createX86_32AsmBackend(const Target &, MCAssembler &);
+TargetAsmBackend *createX86_64AsmBackend(const Target &, MCAssembler &);
+
/// createX86EmitCodeToMemory - Returns a pass that converts a register
/// allocated function into raw machine code in a dynamically
/// allocated chunk of memory.
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp
new file mode 100644
index 0000000..e6654ef
--- /dev/null
+++ b/lib/Target/X86/X86AsmBackend.cpp
@@ -0,0 +1,34 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmBackend.h"
+#include "X86.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmBackend.h"
+using namespace llvm;
+
+namespace {
+
+class X86AsmBackend : public TargetAsmBackend {
+public:
+ X86AsmBackend(const Target &T, MCAssembler &A)
+ : TargetAsmBackend(T) {}
+};
+
+}
+
+TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ MCAssembler &A) {
+ return new X86AsmBackend(T, A);
+}
+
+TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ MCAssembler &A) {
+ return new X86AsmBackend(T, A);
+}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 69a9d60..17366ee 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1161,6 +1161,8 @@ bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) {
if (!X86SelectAddress(DI->getAddress(), AM))
return false;
const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
addFullAddress(BuildMI(MBB, DL, II), AM).addImm(0).
addMetadata(DI->getVariable());
return true;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 7b349f6..08030e0 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -12,15 +12,6 @@
//
//===----------------------------------------------------------------------===//
-// Force NDEBUG on in any optimized build on Darwin.
-//
-// FIXME: This is a huge hack, to work around ridiculously awful compile times
-// on this file with gcc-4.2 on Darwin, in Release mode.
-#if (!defined(__llvm__) && defined(__APPLE__) && \
- defined(__OPTIMIZE__) && !defined(NDEBUG))
-#define NDEBUG
-#endif
-
#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
@@ -177,15 +168,11 @@ namespace {
return "X86 DAG->DAG Instruction Selection";
}
- /// InstructionSelect - This callback is invoked by
- /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
- virtual void InstructionSelect();
-
virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
- virtual bool IsLegalToFold(SDValue N, SDNode *U, SDNode *Root) const;
+ virtual void PreprocessISelDAG();
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -209,18 +196,17 @@ namespace {
SDValue &Scale, SDValue &Index, SDValue &Disp);
bool SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp);
- bool SelectScalarSSELoad(SDNode *Op, SDValue Pred,
- SDValue N, SDValue &Base, SDValue &Scale,
+ bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+ SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
- SDValue &InChain, SDValue &OutChain);
+ SDValue &NodeWithChain);
+
bool TryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
- void PreprocessForRMW();
- void PreprocessForFPConvert();
-
+
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
@@ -296,10 +282,6 @@ namespace {
const X86InstrInfo *getInstrInfo() {
return getTargetMachine().getInstrInfo();
}
-
-#ifndef NDEBUG
- unsigned Indent;
-#endif
};
}
@@ -367,65 +349,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
return true;
}
-
-bool X86DAGToDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root) const {
- if (OptLevel == CodeGenOpt::None) return false;
-
- // Proceed to 'generic' cycle finder code
- return SelectionDAGISel::IsLegalToFold(N, U, Root);
-}
-
-/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
-/// and move load below the TokenFactor. Replace store's chain operand with
-/// load's chain result.
-static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
- SDValue Store, SDValue TF) {
- SmallVector<SDValue, 4> Ops;
- for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
- if (Load.getNode() == TF.getOperand(i).getNode())
- Ops.push_back(Load.getOperand(0));
- else
- Ops.push_back(TF.getOperand(i));
- SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
- SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
- Load.getOperand(1),
- Load.getOperand(2));
- CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
- Store.getOperand(2), Store.getOperand(3));
-}
-
-/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG. The
-/// chain produced by the load must only be used by the store's chain operand,
-/// otherwise this may produce a cycle in the DAG.
-///
-static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
- SDValue &Load) {
- if (N.getOpcode() == ISD::BIT_CONVERT) {
- if (!N.hasOneUse())
- return false;
- N = N.getOperand(0);
- }
-
- LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
- if (!LD || LD->isVolatile())
- return false;
- if (LD->getAddressingMode() != ISD::UNINDEXED)
- return false;
-
- ISD::LoadExtType ExtType = LD->getExtensionType();
- if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
- return false;
-
- if (N.hasOneUse() &&
- LD->hasNUsesOfValue(1, 1) &&
- N.getOperand(1) == Address &&
- LD->isOperandOf(Chain.getNode())) {
- Load = N;
- return true;
- }
- return false;
-}
-
/// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain
/// operand and move load below the call's chain operand.
static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load,
@@ -489,51 +412,14 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
return false;
}
-
-/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
-/// This is only run if not in -O0 mode.
-/// This allows the instruction selector to pick more read-modify-write
-/// instructions. This is a common case:
-///
-/// [Load chain]
-/// ^
-/// |
-/// [Load]
-/// ^ ^
-/// | |
-/// / \-
-/// / |
-/// [TokenFactor] [Op]
-/// ^ ^
-/// | |
-/// \ /
-/// \ /
-/// [Store]
-///
-/// The fact the store's chain operand != load's chain will prevent the
-/// (store (op (load))) instruction from being selected. We can transform it to:
-///
-/// [Load chain]
-/// ^
-/// |
-/// [TokenFactor]
-/// ^
-/// |
-/// [Load]
-/// ^ ^
-/// | |
-/// | \-
-/// | |
-/// | [Op]
-/// | ^
-/// | |
-/// \ /
-/// \ /
-/// [Store]
-void X86DAGToDAGISel::PreprocessForRMW() {
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
- if (I->getOpcode() == X86ISD::CALL) {
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+
+ if (OptLevel != CodeGenOpt::None && N->getOpcode() == X86ISD::CALL) {
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
@@ -553,85 +439,23 @@ void X86DAGToDAGISel::PreprocessForRMW() {
/// \ /
/// \ /
/// [CALL]
- SDValue Chain = I->getOperand(0);
- SDValue Load = I->getOperand(1);
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
if (!isCalleeLoad(Load, Chain))
continue;
- MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain);
+ MoveBelowCallSeqStart(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
continue;
}
-
- if (!ISD::isNON_TRUNCStore(I))
- continue;
- SDValue Chain = I->getOperand(0);
-
- if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
- continue;
-
- SDValue N1 = I->getOperand(1);
- SDValue N2 = I->getOperand(2);
- if ((N1.getValueType().isFloatingPoint() &&
- !N1.getValueType().isVector()) ||
- !N1.hasOneUse())
- continue;
-
- bool RModW = false;
- SDValue Load;
- unsigned Opcode = N1.getNode()->getOpcode();
- switch (Opcode) {
- case ISD::ADD:
- case ISD::MUL:
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::VECTOR_SHUFFLE: {
- SDValue N10 = N1.getOperand(0);
- SDValue N11 = N1.getOperand(1);
- RModW = isRMWLoad(N10, Chain, N2, Load);
- if (!RModW)
- RModW = isRMWLoad(N11, Chain, N2, Load);
- break;
- }
- case ISD::SUB:
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL:
- case ISD::ROTL:
- case ISD::ROTR:
- case ISD::SUBC:
- case ISD::SUBE:
- case X86ISD::SHLD:
- case X86ISD::SHRD: {
- SDValue N10 = N1.getOperand(0);
- RModW = isRMWLoad(N10, Chain, N2, Load);
- break;
- }
- }
-
- if (RModW) {
- MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
- ++NumLoadMoved;
- checkForCycles(I);
- }
- }
-}
-
-
-/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend
-/// nodes that target the FP stack to be store and load to the stack. This is a
-/// gross hack. We would like to simply mark these as being illegal, but when
-/// we do that, legalize produces these when it expands calls, then expands
-/// these in the same legalize pass. We would like dag combine to be able to
-/// hack on these between the call expansion and the node legalization. As such
-/// this pass basically does "really late" legalization of these inline with the
-/// X86 isel pass.
-void X86DAGToDAGISel::PreprocessForFPConvert() {
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ) {
- SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+
+ // Lower fpround and fpextend nodes that target the FP stack to be store and
+ // load to the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
continue;
@@ -687,30 +511,6 @@ void X86DAGToDAGISel::PreprocessForFPConvert() {
}
}
-/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
-/// when it has created a SelectionDAG for us to codegen.
-void X86DAGToDAGISel::InstructionSelect() {
- const Function *F = MF->getFunction();
- OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);
-
- if (OptLevel != CodeGenOpt::None)
- PreprocessForRMW();
-
- // FIXME: This should only happen when not compiled with -O0.
- PreprocessForFPConvert();
-
- // Codegen the basic block.
-#ifndef NDEBUG
- DEBUG(dbgs() << "===== Instruction selection begins:\n");
- Indent = 0;
-#endif
- SelectRoot(*CurDAG);
-#ifndef NDEBUG
- DEBUG(dbgs() << "===== Instruction selection ends:\n");
-#endif
-
- CurDAG->RemoveDeadNodes();
-}
/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
/// the main function.
@@ -1317,22 +1117,24 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
/// match a load whose top elements are either undef or zeros. The load flavor
/// is derived from the type of N, which is either v4f32 or v2f64.
-bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Op, SDValue Pred,
+///
+/// We also return:
+/// PatternChainNode: this is the matched node that has a chain input and
+/// output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
- SDValue &InChain,
- SDValue &OutChain) {
+ SDValue &PatternNodeWithChain) {
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- InChain = N.getOperand(0).getValue(1);
- if (ISD::isNON_EXTLoad(InChain.getNode()) &&
- InChain.getValue(0).hasOneUse() &&
- IsProfitableToFold(N, Pred.getNode(), Op) &&
- IsLegalToFold(N, Pred.getNode(), Op)) {
- LoadSDNode *LD = cast<LoadSDNode>(InChain);
- if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ PatternNodeWithChain = N.getOperand(0);
+ if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+ PatternNodeWithChain.hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root)) {
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp,Segment))
return false;
- OutChain = LD->getChain();
return true;
}
}
@@ -1344,13 +1146,14 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Op, SDValue Pred,
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
N.getOperand(0).getNode()->hasOneUse() &&
ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
- N.getOperand(0).getOperand(0).hasOneUse()) {
+ N.getOperand(0).getOperand(0).hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root)) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
- if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
return false;
- OutChain = LD->getChain();
- InChain = SDValue(LD, 1);
+ PatternNodeWithChain = SDValue(LD, 0);
return true;
}
return false;
@@ -1424,7 +1227,6 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N,
bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp) {
- assert(Op->getOpcode() == X86ISD::TLSADDR);
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
@@ -1451,11 +1253,12 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
- if (ISD::isNON_EXTLoad(N.getNode()) &&
- IsProfitableToFold(N, P, P) &&
- IsLegalToFold(N, P, P))
- return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
- return false;
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ !IsProfitableToFold(N, P, P) ||
+ !IsLegalToFold(N, P, P))
+ return false;
+
+ return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// getGlobalBaseReg - Return an SDNode that returns the value of
@@ -1558,7 +1361,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
Opc = X86::LOCK_DEC16m;
else if (isSub) {
if (isCN) {
- if (Predicate_i16immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_SUB16mi8;
else
Opc = X86::LOCK_SUB16mi;
@@ -1566,7 +1369,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
Opc = X86::LOCK_SUB16mr;
} else {
if (isCN) {
- if (Predicate_i16immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_ADD16mi8;
else
Opc = X86::LOCK_ADD16mi;
@@ -1581,7 +1384,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
Opc = X86::LOCK_DEC32m;
else if (isSub) {
if (isCN) {
- if (Predicate_i32immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_SUB32mi8;
else
Opc = X86::LOCK_SUB32mi;
@@ -1589,7 +1392,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
Opc = X86::LOCK_SUB32mr;
} else {
if (isCN) {
- if (Predicate_i32immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_ADD32mi8;
else
Opc = X86::LOCK_ADD32mi;
@@ -1605,7 +1408,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
else if (isSub) {
Opc = X86::LOCK_SUB64mr;
if (isCN) {
- if (Predicate_i64immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_SUB64mi8;
else if (Predicate_i64immSExt32(Val.getNode()))
Opc = X86::LOCK_SUB64mi32;
@@ -1613,7 +1416,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
} else {
Opc = X86::LOCK_ADD64mr;
if (isCN) {
- if (Predicate_i64immSExt8(Val.getNode()))
+ if (Predicate_immSext8(Val.getNode()))
Opc = X86::LOCK_ADD64mi8;
else if (Predicate_i64immSExt32(Val.getNode()))
Opc = X86::LOCK_ADD64mi32;
@@ -1710,24 +1513,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
DebugLoc dl = Node->getDebugLoc();
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent, ' ') << "Selecting: ";
- Node->dump(CurDAG);
- dbgs() << '\n';
- });
- Indent += 2;
-#endif
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
if (Node->isMachineOpcode()) {
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "== ";
- Node->dump(CurDAG);
- dbgs() << '\n';
- });
- Indent -= 2;
-#endif
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
return NULL; // Already selected.
}
@@ -1823,13 +1612,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "=> ";
- Result.getNode()->dump(CurDAG);
- dbgs() << '\n';
- });
-#endif
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -1852,19 +1635,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = Result.getValue(2);
}
ReplaceUses(SDValue(Node, 1), Result);
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "=> ";
- Result.getNode()->dump(CurDAG);
- dbgs() << '\n';
- });
-#endif
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
-#ifndef NDEBUG
- Indent -= 2;
-#endif
-
return NULL;
}
@@ -1979,13 +1752,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "=> ";
- Result.getNode()->dump(CurDAG);
- dbgs() << '\n';
- });
-#endif
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2009,19 +1776,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = Result.getValue(2);
}
ReplaceUses(SDValue(Node, 1), Result);
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "=> ";
- Result.getNode()->dump(CurDAG);
- dbgs() << '\n';
- });
-#endif
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
-
-#ifndef NDEBUG
- Indent -= 2;
-#endif
-
return NULL;
}
@@ -2134,17 +1890,12 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDNode *ResNode = SelectCode(Node);
-#ifndef NDEBUG
- DEBUG({
- dbgs() << std::string(Indent-2, ' ') << "=> ";
- if (ResNode == NULL || ResNode == Node)
- Node->dump(CurDAG);
- else
- ResNode->dump(CurDAG);
- dbgs() << '\n';
- });
- Indent -= 2;
-#endif
+ DEBUG(dbgs() << "=> ";
+ if (ResNode == NULL || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ dbgs() << '\n');
return ResNode;
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9974d8c..e2b8193 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -73,7 +73,7 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
case X86Subtarget::isDarwin:
if (TM.getSubtarget<X86Subtarget>().is64Bit())
return new X8664_MachoTargetObjectFile();
- return new X8632_MachoTargetObjectFile();
+ return new TargetLoweringObjectFileMachO();
case X86Subtarget::isELF:
if (TM.getSubtarget<X86Subtarget>().is64Bit())
return new X8664_ELFTargetObjectFile(TM);
@@ -990,6 +990,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
@@ -1743,7 +1744,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
// Calculate the new stack slot for the return address.
int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
- MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false);
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
@@ -2376,7 +2377,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
// Set up a frame object for the return address.
uint64_t SlotSize = TD->getPointerSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
- true, false);
+ false, false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -4816,8 +4817,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
isa<ConstantSDNode>(N2)) {
- unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
- : X86ISD::PINSRW;
+ unsigned Opc;
+ if (VT == MVT::v8i16)
+ Opc = X86ISD::PINSRW;
+ else if (VT == MVT::v4i16)
+ Opc = X86ISD::MMX_PINSRW;
+ else if (VT == MVT::v16i8)
+ Opc = X86ISD::PINSRB;
+ else
+ Opc = X86ISD::PINSRB;
+
// Transform it so it match pinsr{b,w} which expects a GR32 as its second
// argument.
if (N1.getValueType() != MVT::i32)
@@ -4868,7 +4877,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
- return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+ return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
+ dl, VT, N0, N1, N2);
}
return SDValue();
}
@@ -5244,7 +5254,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, MVT::i8));
- SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
AndNode, DAG.getConstant(0, MVT::i8));
SDValue Hi, Lo;
@@ -5873,26 +5883,31 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
-static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
+static SDValue LowerToBT(SDValue And, ISD::CondCode CC,
DebugLoc dl, SelectionDAG &DAG) {
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
SDValue LHS, RHS;
- if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
- if (ConstantSDNode *Op010C =
- dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
- if (Op010C->getZExtValue() == 1) {
- LHS = Op0.getOperand(0);
- RHS = Op0.getOperand(1).getOperand(1);
+ if (Op1.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0)))
+ if (And10C->getZExtValue() == 1) {
+ LHS = Op0;
+ RHS = Op1.getOperand(1);
}
- } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
- if (ConstantSDNode *Op000C =
- dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
- if (Op000C->getZExtValue() == 1) {
- LHS = Op0.getOperand(1);
- RHS = Op0.getOperand(0).getOperand(1);
+ } else if (Op0.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+ if (And00C->getZExtValue() == 1) {
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
}
- } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
- ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
- SDValue AndLHS = Op0.getOperand(0);
+ } else if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ SDValue AndLHS = Op0;
if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
LHS = AndLHS.getOperand(0);
RHS = AndLHS.getOperand(1);
@@ -5942,6 +5957,21 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
return NewSetCC;
}
+ // Look for "(setcc) == / != 1" to avoid unncessary setcc.
+ if (Op0.getOpcode() == X86ISD::SETCC &&
+ Op1.getOpcode() == ISD::Constant &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+ cast<ConstantSDNode>(Op1)->isNullValue()) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^
+ cast<ConstantSDNode>(Op1)->isNullValue();
+ if (Invert)
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ }
+
bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
@@ -6444,8 +6474,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
false, false, false, false,
0, CallingConv::C, false, /*isReturnValueUsed=*/false,
- DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
- DAG.GetOrdering(Chain.getNode()));
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
return CallResult.second;
}
@@ -7662,6 +7691,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
@@ -7769,7 +7799,7 @@ bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
if (NumBits1 <= NumBits2)
return false;
- return Subtarget->is64Bit() || NumBits1 < 64;
+ return true;
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
@@ -7779,7 +7809,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
unsigned NumBits2 = VT2.getSizeInBits();
if (NumBits1 <= NumBits2)
return false;
- return Subtarget->is64Bit() || NumBits1 < 64;
+ return true;
}
bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
@@ -8792,10 +8822,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
SDValue RHS = N->getOperand(2);
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
- // instructions have the peculiarity that if either operand is a NaN,
- // they chose what we call the RHS operand (and as such are not symmetric).
- // It happens that this matches the semantics of the common C idiom
- // x<y?x:y and related forms, so we can recognize these cases.
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
if (Subtarget->hasSSE2() &&
(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
Cond.getOpcode() == ISD::SETCC) {
@@ -8803,36 +8832,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode = 0;
// Check for x CC y ? x : y.
- if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
- // This can be a min if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(RHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(LHS))
+ // Converting this to a min would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
+ std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
- // This can be a min if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(LHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(RHS))
- break;
- }
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
- // This can be a min, but if either operand is a NaN we need it to
- // preserve the original LHS.
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETOLT:
case ISD::SETLT:
@@ -8841,32 +8868,29 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SETOGE:
- // This can be a max if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(LHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(RHS))
- break;
- }
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+ break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
- // This can be a max if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(RHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(LHS))
+ // Converting this to a max would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
+ std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
- // This can be a max, but if either operand is a NaN we need it to
- // preserve the original LHS.
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETOGT:
case ISD::SETGT:
@@ -8875,36 +8899,33 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
- } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
- // This can be a min if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(RHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(LHS))
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
+ std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
- // This can be a min if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(LHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(RHS))
- break;
- }
+ // Converting this to a min would handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
- // This can be a min, but if either operand is a NaN we need it to
- // preserve the original LHS.
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETOGT:
case ISD::SETGT:
@@ -8913,32 +8934,28 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SETULT:
- // This can be a max if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(LHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(RHS))
- break;
- }
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
- // This can be a max if we can prove that at least one of the operands
- // is not a nan.
- if (!FiniteOnlyFPMath()) {
- if (DAG.isKnownNeverNaN(RHS)) {
- // Put the potential NaN in the RHS so that SSE will preserve it.
- std::swap(LHS, RHS);
- } else if (!DAG.isKnownNeverNaN(LHS))
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
+ std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
- // This can be a max, but if either operand is a NaN we need it to
- // preserve the original LHS.
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETOLT:
case ISD::SETLT:
@@ -9157,16 +9174,64 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// PerformANDCombine - Look for SSE and instructions of this form:
+/// (and x, (build_vector signbit,signbit,signbit,signbit)). If there
+/// exists a use of a build_vector that's the bitwise complement of the mask,
+/// then transform the node to
+/// (and (xor x, (build_vector -1,-1,-1,-1)), (build_vector ~sb,~sb,~sb,~sb)).
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getOpcode() == ISD::XOR || !N1.hasOneUse())
+ return SDValue();
+
+ if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue, 8> Mask;
+ Mask.reserve(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Arg = N1.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) {
+ Mask.push_back(Arg);
+ continue;
+ }
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Arg);
+ if (!C)
+ return SDValue();
+ if (!C->getAPIntValue().isSignBit() &&
+ !C->getAPIntValue().isMaxSignedValue())
+ return SDValue();
+ Mask.push_back(DAG.getConstant(~C->getAPIntValue(), EltVT));
+ }
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, N1.getDebugLoc(), VT,
+ &Mask[0], NumElts);
+ if (!N1.use_empty()) {
+ unsigned Bits = EltVT.getSizeInBits();
+ Mask.clear();
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(DAG.getConstant(APInt::getAllOnesValue(Bits), EltVT));
+ SDValue NewMask = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ VT, &Mask[0], NumElts);
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+ N0, NewMask), N1);
+ }
+ }
+
+ return SDValue();
+}
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- if (DAG.getMachineFunction().
- getFunction()->hasFnAttr(Attribute::OptimizeForSize))
- return SDValue();
-
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
@@ -9305,7 +9370,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
}
} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+ unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
if (C->getZExtValue() == SplatIdx)
BaseShAmt = InVec.getOperand(1);
}
@@ -9690,6 +9755,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case ISD::AND: return PerformANDCombine(N, DAG, DCI);
case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
case ISD::SRA:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index cf0eb40..ffaf1cf 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -180,7 +180,7 @@ namespace llvm {
/// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRW.
- PINSRW,
+ PINSRW, MMX_PINSRW,
/// PSHUFB - Shuffle 16 8-bit values within a vector.
PSHUFB,
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index 4ea3739..8462255 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -59,10 +59,11 @@ def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr",
// Pattern fragments.
//
-def i64immSExt8 : PatLeaf<(i64 imm), [{
- // i64immSExt8 predicate - True if the 64-bit immediate fits in a 8-bit
- // sign extended field.
- return (int64_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+def i64immSExt8 : PatLeaf<(i64 immSext8)>;
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((unsigned)N->getZExtValue());
}]>;
def i64immSExt32 : PatLeaf<(i64 imm), [{
@@ -71,6 +72,7 @@ def i64immSExt32 : PatLeaf<(i64 imm), [{
return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
}]>;
+
def i64immZExt32 : PatLeaf<(i64 imm), [{
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsignedsign extended field.
@@ -325,7 +327,7 @@ def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (load addr:$src))]>;
@@ -556,7 +558,7 @@ def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
addr:$dst)]>;
def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
"adc{q}\t{$src2, $dst|$dst, $src2}",
- [(store (adde (load addr:$dst), i64immSExt8:$src2),
+ [(store (adde (load addr:$dst), i64immSExt32:$src2),
addr:$dst)]>;
} // Uses = [EFLAGS]
@@ -1981,7 +1983,7 @@ def : Pat<(and GR64:$src, i64immZExt32:$imm),
(i64 0),
(AND32ri
(EXTRACT_SUBREG GR64:$src, x86_subreg_32bit),
- imm:$imm),
+ (i32 (GetLo32XForm imm:$imm))),
x86_subreg_32bit)>;
// r & (2^32-1) ==> movz
@@ -2105,34 +2107,34 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
// (shl x (and y, 63)) ==> (shl x, y)
-def : Pat<(shl GR64:$src1, (and CL:$amt, 63)),
+def : Pat<(shl GR64:$src1, (and CL, 63)),
(SHL64rCL GR64:$src1)>;
-def : Pat<(store (shl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SHL64mCL addr:$dst)>;
-def : Pat<(srl GR64:$src1, (and CL:$amt, 63)),
+def : Pat<(srl GR64:$src1, (and CL, 63)),
(SHR64rCL GR64:$src1)>;
-def : Pat<(store (srl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SHR64mCL addr:$dst)>;
-def : Pat<(sra GR64:$src1, (and CL:$amt, 63)),
+def : Pat<(sra GR64:$src1, (and CL, 63)),
(SAR64rCL GR64:$src1)>;
-def : Pat<(store (sra (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
(SAR64mCL addr:$dst)>;
// Double shift patterns
-def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm)),
(SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1),
- GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR64:$src2, (i8 imm)), addr:$dst),
(SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
-def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm)),
(SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1),
- GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR64:$src2, (i8 imm)), addr:$dst),
(SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index e22a903..ae24bfb 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -397,7 +397,7 @@ def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
let canFoldAsLoad = 1 in {
def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP32:$dst, (loadf32 addr:$src))]>;
-let isReMaterializable = 1, mayHaveSideEffects = 1 in
+let isReMaterializable = 1 in
def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
[(set RFP64:$dst, (loadf64 addr:$src))]>;
def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index a0d0312..39bda04 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -276,11 +276,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
- { X86::MOVPS2SSrr, X86::MOVPS2SSmr, 0, 0 },
- { X86::MOVSDrr, X86::MOVSDmr, 0, 0 },
{ X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
{ X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
- { X86::MOVSSrr, X86::MOVSSmr, 0, 0 },
{ X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
{ X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
{ X86::MUL16r, X86::MUL16m, 1, 0 },
@@ -389,12 +386,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, 16 },
- { X86::MOVSD2PDrr, X86::MOVSD2PDrm, 0 },
- { X86::MOVSDrr, X86::MOVSDrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
- { X86::MOVSS2PSrr, X86::MOVSS2PSrm, 0 },
- { X86::MOVSSrr, X86::MOVSSrm, 0 },
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
{ X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
{ X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
@@ -682,23 +675,20 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
case X86::MOV16rr:
case X86::MOV32rr:
case X86::MOV64rr:
- case X86::MOVSSrr:
- case X86::MOVSDrr:
// FP Stack register class copies
case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
case X86::MOV_Fp3264: case X86::MOV_Fp3280:
case X86::MOV_Fp6432: case X86::MOV_Fp8032:
-
+
+ // Note that MOVSSrr and MOVSDrr are not considered copies. FR32 and FR64
+ // copies are done with FsMOVAPSrr and FsMOVAPDrr.
+
case X86::FsMOVAPSrr:
case X86::FsMOVAPDrr:
case X86::MOVAPSrr:
case X86::MOVAPDrr:
case X86::MOVDQArr:
- case X86::MOVSS2PSrr:
- case X86::MOVSD2PDrr:
- case X86::MOVPS2SSrr:
- case X86::MOVPD2SDrr:
case X86::MMX_MOVQ64rr:
assert(MI.getNumOperands() >= 2 &&
MI.getOperand(0).isReg() &&
@@ -1083,7 +1073,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
case X86::MOV8r0: Opc = X86::MOV8ri; break;
case X86::MOV16r0: Opc = X86::MOV16ri; break;
case X86::MOV32r0: Opc = X86::MOV32ri; break;
- case X86::MOV64r0: Opc = X86::MOV64ri; break;
+ case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
}
Clone = false;
}
@@ -1860,7 +1850,7 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
CommonRC = SrcRC;
else if (!DestRC->hasSubClass(SrcRC)) {
// Neither of GR64_NOREX or GR64_NOSP is a superclass of the other,
- // but we want to copy then as GR64. Similarly, for GR32_NOREX and
+ // but we want to copy them as GR64. Similarly, for GR32_NOREX and
// GR32_NOSP, copy as GR32.
if (SrcRC->hasSuperClass(&X86::GR64RegClass) &&
DestRC->hasSuperClass(&X86::GR64RegClass))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 25cd297..cfe71a5 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -343,18 +343,37 @@ def X86_COND_O : PatLeaf<(i8 13)>;
def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
def X86_COND_S : PatLeaf<(i8 15)>;
-def i16immSExt8 : PatLeaf<(i16 imm), [{
- // i16immSExt8 predicate - True if the 16-bit immediate fits in a 8-bit
- // sign extended field.
- return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+def immSext8 : PatLeaf<(imm), [{
+ return N->getSExtValue() == (int8_t)N->getSExtValue();
}]>;
-def i32immSExt8 : PatLeaf<(i32 imm), [{
- // i32immSExt8 predicate - True if the 32-bit immediate fits in a 8-bit
- // sign extended field.
- return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+def i16immSExt8 : PatLeaf<(i16 immSext8)>;
+def i32immSExt8 : PatLeaf<(i32 immSext8)>;
+
+/// Load patterns: these constraint the match to the right address space.
+def dsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 256;
+ return false;
+}]>;
+
+def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 257;
+ return false;
}]>;
+
// Helper fragments for loads.
// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
// known to be 32-bit aligned or better. Ditto for i8 to i16.
@@ -372,8 +391,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
return false;
}]>;
-def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),
-[{
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
LoadSDNode *LD = cast<LoadSDNode>(N);
if (const Value *Src = LD->getSrcValue())
if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
@@ -399,72 +417,11 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
return false;
}]>;
-def nvloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- if (const Value *Src = LD->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- if (LD->isVolatile())
- return false;
- ISD::LoadExtType ExtType = LD->getExtensionType();
- if (ExtType == ISD::NON_EXTLOAD)
- return true;
- if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 4;
- return false;
-}]>;
-
-def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- return PT->getAddressSpace() == 256;
- return false;
-}]>;
-
-def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- return PT->getAddressSpace() == 257;
- return false;
-}]>;
-
-def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr)), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- return true;
-}]>;
-def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr)), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- return true;
-}]>;
-
-def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr)), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- return true;
-}]>;
-def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr)), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- return true;
-}]>;
-def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr)), [{
- if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
- if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
- if (PT->getAddressSpace() > 255)
- return false;
- return true;
-}]>;
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (dsload node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (dsload node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (dsload node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (dsload node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (dsload node:$ptr))>;
def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
@@ -1037,7 +994,7 @@ def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(set GR8:$dst, (loadi8 addr:$src))]>;
@@ -1071,7 +1028,7 @@ def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
let mayLoad = 1,
- canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+ canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
@@ -1156,7 +1113,7 @@ def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
} // neverHasSideEffects
// unsigned division/remainder
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+let Defs = [AX,EFLAGS], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"div{b}\t$src", []>;
let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
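The DIV8r change above corrects the implicit defs: a byte divide writes its quotient to AL and its remainder to AH, i.e. it clobbers all of AX (plus EFLAGS). A small GCC/Clang inline-asm sketch of that contract (illustration only; helper name made up):

    #include <cstdint>

    // Illustrative sketch: DIV r/m8 divides AX by the operand, quotient -> AL,
    // remainder -> AH, so the whole of AX is written, as the new Defs list says.
    void div8(uint16_t ax, uint8_t d, uint8_t &quot, uint8_t &rem) {
      asm("divb %[d]" : "+a"(ax) : [d] "r"(d) : "cc");
      quot = uint8_t(ax);        // AL
      rem  = uint8_t(ax >> 8);   // AH
    }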
@@ -4442,12 +4399,6 @@ def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>;
def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
-// (and (i32 load), 255) -> (zextload i8)
-def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))),
- (MOVZX32rm8 addr:$src)>;
-def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 65535))),
- (MOVZX32rm16 addr:$src)>;
-
//===----------------------------------------------------------------------===//
// Some peepholes
//===----------------------------------------------------------------------===//
@@ -4543,43 +4494,43 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
// (shl x (and y, 31)) ==> (shl x, y)
-def : Pat<(shl GR8:$src1, (and CL:$amt, 31)),
+def : Pat<(shl GR8:$src1, (and CL, 31)),
(SHL8rCL GR8:$src1)>;
-def : Pat<(shl GR16:$src1, (and CL:$amt, 31)),
+def : Pat<(shl GR16:$src1, (and CL, 31)),
(SHL16rCL GR16:$src1)>;
-def : Pat<(shl GR32:$src1, (and CL:$amt, 31)),
+def : Pat<(shl GR32:$src1, (and CL, 31)),
(SHL32rCL GR32:$src1)>;
-def : Pat<(store (shl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
(SHL8mCL addr:$dst)>;
-def : Pat<(store (shl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
(SHL16mCL addr:$dst)>;
-def : Pat<(store (shl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
(SHL32mCL addr:$dst)>;
-def : Pat<(srl GR8:$src1, (and CL:$amt, 31)),
+def : Pat<(srl GR8:$src1, (and CL, 31)),
(SHR8rCL GR8:$src1)>;
-def : Pat<(srl GR16:$src1, (and CL:$amt, 31)),
+def : Pat<(srl GR16:$src1, (and CL, 31)),
(SHR16rCL GR16:$src1)>;
-def : Pat<(srl GR32:$src1, (and CL:$amt, 31)),
+def : Pat<(srl GR32:$src1, (and CL, 31)),
(SHR32rCL GR32:$src1)>;
-def : Pat<(store (srl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
(SHR8mCL addr:$dst)>;
-def : Pat<(store (srl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
(SHR16mCL addr:$dst)>;
-def : Pat<(store (srl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
(SHR32mCL addr:$dst)>;
-def : Pat<(sra GR8:$src1, (and CL:$amt, 31)),
+def : Pat<(sra GR8:$src1, (and CL, 31)),
(SAR8rCL GR8:$src1)>;
-def : Pat<(sra GR16:$src1, (and CL:$amt, 31)),
+def : Pat<(sra GR16:$src1, (and CL, 31)),
(SAR16rCL GR16:$src1)>;
-def : Pat<(sra GR32:$src1, (and CL:$amt, 31)),
+def : Pat<(sra GR32:$src1, (and CL, 31)),
(SAR32rCL GR32:$src1)>;
-def : Pat<(store (sra (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
(SAR8mCL addr:$dst)>;
-def : Pat<(store (sra (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
(SAR16mCL addr:$dst)>;
-def : Pat<(store (sra (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
(SAR32mCL addr:$dst)>;
// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
@@ -4600,11 +4551,11 @@ def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
addr:$dst),
(SHRD32mrCL addr:$dst, GR32:$src2)>;
-def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm/*:$amt2*/)),
(SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1),
- GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR32:$src2, (i8 imm/*:$amt2*/)), addr:$dst),
(SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
@@ -4625,11 +4576,11 @@ def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
addr:$dst),
(SHLD32mrCL addr:$dst, GR32:$src2)>;
-def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm/*:$amt2*/)),
(SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1),
- GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR32:$src2, (i8 imm/*:$amt2*/)), addr:$dst),
(SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
@@ -4650,11 +4601,11 @@ def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
addr:$dst),
(SHRD16mrCL addr:$dst, GR16:$src2)>;
-def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm/*:$amt2*/)),
(SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1),
- GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR16:$src2, (i8 imm/*:$amt2*/)), addr:$dst),
(SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
@@ -4675,11 +4626,11 @@ def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
addr:$dst),
(SHLD16mrCL addr:$dst, GR16:$src2)>;
-def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm/*:$amt2*/)),
(SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
- GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+ GR16:$src2, (i8 imm/*:$amt2*/)), addr:$dst),
(SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
// (anyext (setcc_carry)) -> (setcc_carry)
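The rewritten shift patterns above drop the unused $amt binding; what they encode is that x86 shifts by %cl already mask the count to 5 bits, so an explicit "& 31" in the source can be folded into a single shift. C++ illustration (not part of the patch; helper names made up):

    // The "& 31" is redundant on x86: SHL/SHR/SAR by %cl mask the count
    // themselves, so each of these should select one shift instruction.
    unsigned shl_masked(unsigned x, unsigned n) { return x << (n & 31); }
    unsigned shr_masked(unsigned x, unsigned n) { return x >> (n & 31); }
    int      sar_masked(int x, unsigned n)      { return x >> (n & 31); }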
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 89f020c..c8e0723 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -141,7 +141,7 @@ def MMX_MOVD64rrv164 : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR64:$dst, (load_mmx addr:$src))]>;
@@ -426,13 +426,15 @@ def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
// Extract / Insert
-def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
-def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::MMX_PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+
def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+ [(set GR32:$dst, (X86pextrw (v4i16 VR64:$src1),
(iPTR imm:$src2)))]>;
let Constraints = "$src1 = $dst" in {
def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
@@ -597,13 +599,6 @@ let AddedComplexity = 10 in {
(MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
}
-// Patterns to perform vector shuffling with a zeroed out vector.
-let AddedComplexity = 20 in {
- def : Pat<(bc_v2i32 (mmx_unpckl immAllZerosV,
- (v2i32 (scalar_to_vector (load_mmx addr:$src))))),
- (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
-}
-
// Some special case PANDN patterns.
// FIXME: Get rid of these.
def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
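The extract/insert changes above give MMX its own MMX_PINSRW node with an explicit v4i16 profile, while PEXTRW reuses the generic X86pextrw node. For reference, the source-level intrinsics that exercise these patterns (illustration only; helper names made up):

    #include <xmmintrin.h>   // _mm_extract_pi16 / _mm_insert_pi16

    int   get_lane3(__m64 v)        { return _mm_extract_pi16(v, 3); }   // pextrw
    __m64 set_lane0(__m64 v, int s) { return _mm_insert_pi16(v, s, 0); } // pinsrw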
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9b2140f..2743dba 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -160,6 +160,32 @@ def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+// MOVNT Support
+// Like 'store', but requires the non-temporal bit to be set
+def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal();
+ return false;
+}]>;
+
+def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() && !ST->isTruncatingStore() &&
+ ST->getAddressingMode() == ISD::UNINDEXED &&
+ ST->getAlignment() >= 16;
+ return false;
+}]>;
+
+def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() &&
+ ST->getAlignment() < 16;
+ return false;
+}]>;
+
def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
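The three PatFrags above classify stores by the StoreSDNode's non-temporal flag and alignment (plain, aligned >= 16, unaligned). At the source level that flag normally comes from the streaming-store intrinsics (illustration only; helper name made up):

    #include <emmintrin.h>   // _mm_stream_ps (SSE1), _mm_stream_si32 (SSE2)

    void publish(float *dst16 /* 16-byte aligned */, __m128 v, int *flag) {
      _mm_stream_ps(dst16, v);    // aligned non-temporal vector store -> movntps
      _mm_stream_si32(flag, 1);   // scalar non-temporal store         -> movnti
    }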
@@ -344,18 +370,56 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
// SSE1 Instructions
//===----------------------------------------------------------------------===//
-// Move Instructions
-let neverHasSideEffects = 1 in
-def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movss\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+// Move Instructions. Register-to-register movss is not used for FR32
+// register copies because it's a partial register update; FsMOVAPSrr is
+// used instead. Register-to-register movss is not modeled as an INSERT_SUBREG
+// because INSERT_SUBREG requires that the insert be implementable in terms of
+// a copy, and, as just mentioned, we don't use movss for copies.
+let Constraints = "$src1 = $dst" in
+def MOVSSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movl VR128:$src1, (scalar_to_vector FR32:$src2)))]>;
+
+// Extract the low 32-bit value from one vector and insert it into another.
+let AddedComplexity = 15 in
+def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+ (MOVSSrr VR128:$src1,
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), x86_subreg_ss))>;
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, x86_subreg_ss)>;
+
+// Loading from memory automatically zeroing upper bits.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
"movss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (loadf32 addr:$src))]>;
+
+// MOVSSrm zeros the high parts of the register; represent this
+// with SUBREG_TO_REG.
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>;
+def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>;
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), x86_subreg_ss)>;
+}
+
+// Store scalar value to memory.
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
[(store FR32:$src, addr:$dst)]>;
+// Extract and store.
+def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst,
+ (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>;
+
// Conversion instructions
def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
"cvttss2si\t{$src, $dst|$dst, $src}",
@@ -518,7 +582,7 @@ def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
// disregarded.
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
@@ -715,7 +779,7 @@ defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
let neverHasSideEffects = 1 in
def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
@@ -727,7 +791,7 @@ def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
let neverHasSideEffects = 1 in
def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movups\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (loadv4f32 addr:$src))]>;
@@ -736,7 +800,7 @@ def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
[(store (v4f32 VR128:$src), addr:$dst)]>;
// Intrinsic forms of MOVUPS load and store
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movups\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
@@ -796,9 +860,9 @@ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
let AddedComplexity = 20 in {
def : Pat<(v4f32 (movddup VR128:$src, (undef))),
- (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+ (MOVLHPSrr VR128:$src, VR128:$src)>;
def : Pat<(v2i64 (movddup VR128:$src, (undef))),
- (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+ (MOVLHPSrr VR128:$src, VR128:$src)>;
}
@@ -1013,10 +1077,33 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
"prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
// Non-temporal stores
-def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+ (MOVNTDQ_64mr VR128:$src, addr:$dst)>;
+
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
+ TB, Requires<[HasSSE2]>;
+
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
+ TB, Requires<[HasSSE2]>;
+}
+
// Load, store, and memory fence
def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;
@@ -1035,84 +1122,73 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllZerosV))]>;
-let Predicates = [HasSSE1] in {
- def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
- def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
- def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
- def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
- def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
-}
-
-// FR32 to 128-bit vector conversion.
-let isAsCheapAsAMove = 1 in
-def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4f32 (scalar_to_vector FR32:$src)))]>;
-def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
-
-// FIXME: may not be able to eliminate this movss with coalescing the src and
-// dest register classes are different. We really want to write this pattern
-// like this:
-// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-// (f32 FR32:$src)>;
-let isAsCheapAsAMove = 1 in
-def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
- (iPTR 0)))]>;
-def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(store (f32 (vector_extract (v4f32 VR128:$src),
- (iPTR 0))), addr:$dst)]>;
-
-
-// Move to lower bits of a VR128, leaving upper bits alone.
-// Three operand (but two address) aliases.
-let Constraints = "$src1 = $dst" in {
-let neverHasSideEffects = 1 in
- def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
- "movss\t{$src2, $dst|$dst, $src2}", []>;
-
- let AddedComplexity = 15 in
- def MOVLPSrr : SSI<0x10, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- "movss\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v4f32 (movl VR128:$src1, VR128:$src2)))]>;
-}
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
-// Move to lower bits of a VR128 and zeroing upper bits.
-// Loading from memory automatically zeroing upper bits.
-let AddedComplexity = 20 in
-def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
- (loadf32 addr:$src))))))]>;
-
-def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (MOVZSS2PSrm addr:$src)>;
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>;
//===---------------------------------------------------------------------===//
// SSE2 Instructions
//===---------------------------------------------------------------------===//
-// Move Instructions
-let neverHasSideEffects = 1 in
-def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movsd\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+// Move Instructions. Register-to-register movsd is not used for FR64
+// register copies because it's a partial register update; FsMOVAPDrr is
+// used instead. Register-to-register movsd is not modeled as an INSERT_SUBREG
+// because INSERT_SUBREG requires that the insert be implementable in terms of
+// a copy, and, as just mentioned, we don't use movsd for copies.
+let Constraints = "$src1 = $dst" in
+def MOVSDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movl VR128:$src1, (scalar_to_vector FR64:$src2)))]>;
+
+// Extract the low 64-bit value from one vector and insert it into another.
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1,
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), x86_subreg_sd))>;
+
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, x86_subreg_sd)>;
+
+// Loading from memory automatically zeroing upper bits.
+let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 20 in
def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (loadf64 addr:$src))]>;
+
+// MOVSDrm zeros the high parts of the register; represent this
+// with SUBREG_TO_REG.
+let AddedComplexity = 20 in {
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>;
+def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>;
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>;
+def : Pat<(v2f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), x86_subreg_sd)>;
+}
+
+// Store scalar value to memory.
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(store FR64:$src, addr:$dst)]>;
+// Extract and store.
+def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst,
+ (EXTRACT_SUBREG (v2f64 VR128:$src), x86_subreg_sd))>;
+
// Conversion instructions
def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
"cvttsd2si\t{$src, $dst|$dst, $src}",
@@ -1166,7 +1242,8 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
Requires<[HasSSE2, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+ (CVTSS2SDrr (MOVSSrm addr:$src))>,
+ Requires<[HasSSE2, OptForSpeed]>;
// Match intrinsics which expect XMM operand(s).
def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
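The reflowed extloadf32 pattern keeps the OptForSpeed behaviour: a float loaded only to be widened is loaded with movss and converted register-to-register, presumably to sidestep the memory-form cvtss2sd and its partial update of the destination register. Source-level trigger (illustration only; helper name made up):

    // Under optimization for speed this is the
    // (extloadf32 addr) -> CVTSS2SDrr(MOVSSrm addr) pattern above.
    double widen(const float *p) { return static_cast<double>(*p); }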
@@ -1285,7 +1362,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
// disregarded.
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
@@ -1483,7 +1560,7 @@ defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
let neverHasSideEffects = 1 in
def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
@@ -2298,17 +2375,30 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
// Non-temporal stores
-def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "movntpd\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
-def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
-def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
TB, Requires<[HasSSE2]>;
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
+
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+
+def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+ (MOVNTDQmr VR128:$src, addr:$dst)>;
+}
+
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
@@ -2321,11 +2411,11 @@ def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
"mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
//TODO: custom lower this so as to never even generate the noop
-def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
(i8 0)), (NOOP)>;
def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
-def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
(i8 1)), (MFENCE)>;
// Alias instructions that map zero vector to pxor / xorp* for sse.
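The membarrier patterns above drop the unused operand bindings but keep the lowering: particular load/store combinations select SFENCE or LFENCE, the general case selects MFENCE, and compiler-only barriers become a no-op. A sketch of how these are reached from source, assuming the __sync_synchronize builtin lowers through the memory-barrier node in this revision (illustration only; helper names made up):

    #include <emmintrin.h>   // _mm_sfence / _mm_lfence / _mm_mfence

    // Full compiler+hardware barrier; on x86 this typically ends up as MFENCE.
    void full_barrier() { __sync_synchronize(); }

    // The individual fences are also reachable directly as intrinsics.
    void fences() { _mm_sfence(); _mm_lfence(); _mm_mfence(); }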
@@ -2337,17 +2427,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
-// FR64 to 128-bit vector conversion.
-let isAsCheapAsAMove = 1 in
-def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2f64 (scalar_to_vector FR64:$src)))]>;
-def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
-
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2376,20 +2455,9 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)]>;
-// FIXME: may not be able to eliminate this movss with coalescing the src and
-// dest register classes are different. We really want to write this pattern
-// like this:
-// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-// (f32 FR32:$src)>;
-let isAsCheapAsAMove = 1 in
-def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
- (iPTR 0)))]>;
-def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(store (f64 (vector_extract (v2f64 VR128:$src),
- (iPTR 0))), addr:$dst)]>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), x86_subreg_sd))>;
+
def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -2406,44 +2474,11 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
-
-// Move to lower bits of a VR128, leaving upper bits alone.
-// Three operand (but two address) aliases.
-let Constraints = "$src1 = $dst" in {
- let neverHasSideEffects = 1 in
- def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
- "movsd\t{$src2, $dst|$dst, $src2}", []>;
-
- let AddedComplexity = 15 in
- def MOVLPDrr : SDI<0x10, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- "movsd\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v2f64 (movl VR128:$src1, VR128:$src2)))]>;
-}
-
// Store / copy lower 64-bits of a XMM register.
def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
-// Move to lower bits of a VR128 and zeroing upper bits.
-// Loading from memory automatically zeroing upper bits.
-let AddedComplexity = 20 in {
-def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
- (loadf64 addr:$src))))))]>;
-
-def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (MOVZSD2PDrm addr:$src)>;
-def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (MOVZSD2PDrm addr:$src)>;
-def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
-}
-
// movd / movq to XMM register zero-extends
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
@@ -2989,13 +3024,15 @@ let Predicates = [HasSSE2] in {
let AddedComplexity = 15 in {
// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>;
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+ (MOVSSrr (v4f32 (V_SET0)),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss)))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+ (MOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v4i32 VR128:$src), x86_subreg_ss))>;
}
// Splat v2f64 / v2i64
@@ -3013,8 +3050,7 @@ def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
// Special unary SHUFPSrri case.
def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
(SHUFPSrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE1]>;
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
let AddedComplexity = 5 in
def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
(PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
@@ -3060,13 +3096,13 @@ def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
}
let AddedComplexity = 10 in {
def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
- (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+ (UNPCKLPSrr VR128:$src, VR128:$src)>;
def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKLBWrr VR128:$src, VR128:$src)>;
def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKLWDrr VR128:$src, VR128:$src)>;
def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKLDQrr VR128:$src, VR128:$src)>;
}
// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
@@ -3080,13 +3116,13 @@ def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
}
let AddedComplexity = 10 in {
def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
- (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+ (UNPCKHPSrr VR128:$src, VR128:$src)>;
def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKHBWrr VR128:$src, VR128:$src)>;
def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKHWDrr VR128:$src, VR128:$src)>;
def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ (PUNPCKHDQrr VR128:$src, VR128:$src)>;
}
let AddedComplexity = 20 in {
@@ -3108,45 +3144,49 @@ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
let AddedComplexity = 20 in {
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
}
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
let AddedComplexity = 15 in {
// Setting the lowest element in the vector.
def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
- (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), x86_subreg_ss))>;
def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
- (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), x86_subreg_sd))>;
-// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
- (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, x86_subreg_sd))>,
+ Requires<[HasSSE2]>;
def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
- (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, x86_subreg_sd))>,
+ Requires<[HasSSE2]>;
}
// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
// fall back to this for SSE1)
def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
(SHUFPSrri VR128:$src2, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>, Requires<[HasSSE1]>;
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
// Set lowest element and zero upper elements.
let AddedComplexity = 15 in
@@ -3188,30 +3228,30 @@ def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
// Use movaps / movups for SSE integer load / store (one byte shorter).
def : Pat<(alignedloadv4i32 addr:$src),
- (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
+ (MOVAPSrm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
- (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
+ (MOVUPSrm addr:$src)>;
def : Pat<(alignedloadv2i64 addr:$src),
- (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
+ (MOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
- (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>;
+ (MOVUPSrm addr:$src)>;
def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVUPSmr addr:$dst, VR128:$src)>;
//===----------------------------------------------------------------------===//
// SSE4.1 Instructions
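The block of patterns above routes SSE integer vector loads and stores through movaps/movups, which encode one byte shorter than movdqa/movdqu. Source-level shapes that hit these patterns (illustration only; helper names made up):

    #include <emmintrin.h>

    // Aligned and unaligned integer-vector accesses; with the patterns above
    // they may be emitted as movaps/movups instead of movdqa/movdqu.
    __m128i load_aligned(const __m128i *p)      { return _mm_load_si128(p); }
    void store_unaligned(__m128i *p, __m128i v) { _mm_storeu_si128(p, v); }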
@@ -3400,7 +3440,7 @@ let Constraints = "$src1 = $dst" in {
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst,
- (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize;
+ (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize;
def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp
index 91c0fbb..250634f 100644
--- a/lib/Target/X86/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/X86MCAsmInfo.cpp
@@ -55,6 +55,11 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) {
if (!is64Bit)
Data64bitsDirective = 0; // we can't emit a 64-bit unit
+ // Use ## as a comment string so that .s files generated by llvm can go
+ // through the GCC preprocessor without causing an error. This is needed
+ // because "clang foo.s" runs the C preprocessor, which is usually reserved
+ // for .S files on other systems. Perhaps this is because the file system
+ // wasn't always case preserving or something.
CommentString = "##";
PCSymbol = ".";
@@ -70,6 +75,8 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &Triple) {
AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
+ TextAlignFillValue = 0x90;
+
PrivateGlobalPrefix = ".L";
WeakRefDirective = "\t.weak\t";
PCSymbol = ".";
@@ -94,4 +101,6 @@ MCSection *X86ELFMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const {
X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) {
AsmTransCBE = x86_asm_table;
AssemblerDialect = AsmWriterFlavor;
-}
\ No newline at end of file
+
+ TextAlignFillValue = 0x90;
+}
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 8524236..946d6b2 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -191,6 +191,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
return &X86::GR16_NOREXRegClass;
else if (A == &X86::GR16_ABCDRegClass)
return &X86::GR16_ABCDRegClass;
+ } else if (B == &X86::FR32RegClass) {
+ return A;
}
break;
case 2:
@@ -207,6 +209,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
A == &X86::GR16_NOREXRegClass)
return &X86::GR16_ABCDRegClass;
+ } else if (B == &X86::FR64RegClass) {
+ return A;
}
break;
case 3:
@@ -234,6 +238,8 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
return &X86::GR32_NOREXRegClass;
else if (A == &X86::GR32_ABCDRegClass)
return &X86::GR64_ABCDRegClass;
+ } else if (B == &X86::VR128RegClass) {
+ return A;
}
break;
case 4:
@@ -446,8 +452,10 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *F = MF.getFunction();
bool requiresRealignment =
- RealignStack && (MFI->getMaxAlignment() > StackAlign);
+ RealignStack && ((MFI->getMaxAlignment() > StackAlign) ||
+ F->hasFnAttr(Attribute::StackAlignment));
// FIXME: Currently we don't support stack realignment for functions with
// variable-sized allocas.
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 8fb5e92..e4bdb4e 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -35,7 +35,8 @@ namespace X86 {
/// these indices must be kept in sync with the class indices in the
/// X86RegisterInfo.td file.
enum SubregIndex {
- SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
+ SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4,
+ SUBREG_SS = 1, SUBREG_SD = 2, SUBREG_XMM = 3
};
}
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 1559bf7..ed2ce6c 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -158,22 +158,22 @@ let Namespace = "X86" in {
def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
// YMM Registers, used by AVX instructions
- def YMM0: Register<"ymm0">, DwarfRegNum<[17, 21, 21]>;
- def YMM1: Register<"ymm1">, DwarfRegNum<[18, 22, 22]>;
- def YMM2: Register<"ymm2">, DwarfRegNum<[19, 23, 23]>;
- def YMM3: Register<"ymm3">, DwarfRegNum<[20, 24, 24]>;
- def YMM4: Register<"ymm4">, DwarfRegNum<[21, 25, 25]>;
- def YMM5: Register<"ymm5">, DwarfRegNum<[22, 26, 26]>;
- def YMM6: Register<"ymm6">, DwarfRegNum<[23, 27, 27]>;
- def YMM7: Register<"ymm7">, DwarfRegNum<[24, 28, 28]>;
- def YMM8: Register<"ymm8">, DwarfRegNum<[25, -2, -2]>;
- def YMM9: Register<"ymm9">, DwarfRegNum<[26, -2, -2]>;
- def YMM10: Register<"ymm10">, DwarfRegNum<[27, -2, -2]>;
- def YMM11: Register<"ymm11">, DwarfRegNum<[28, -2, -2]>;
- def YMM12: Register<"ymm12">, DwarfRegNum<[29, -2, -2]>;
- def YMM13: Register<"ymm13">, DwarfRegNum<[30, -2, -2]>;
- def YMM14: Register<"ymm14">, DwarfRegNum<[31, -2, -2]>;
- def YMM15: Register<"ymm15">, DwarfRegNum<[32, -2, -2]>;
+ def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>;
+ def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>;
+ def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>;
+ def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>;
+ def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>;
+ def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>;
+ def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>;
+ def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>;
+ def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>;
+ def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>;
+ def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>;
+ def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>;
+ def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>;
+ def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>;
+ def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>;
+ def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>;
// Floating point stack registers
def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
@@ -238,6 +238,10 @@ def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
def x86_subreg_16bit : PatLeaf<(i32 3)>;
def x86_subreg_32bit : PatLeaf<(i32 4)>;
+def x86_subreg_ss : PatLeaf<(i32 1)>;
+def x86_subreg_sd : PatLeaf<(i32 2)>;
+def x86_subreg_xmm : PatLeaf<(i32 3)>;
+
def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
@@ -277,11 +281,31 @@ def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
[EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
-def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15],
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+
+def : SubRegSet<2, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15],
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+
+def : SubRegSet<3, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15],
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+def : SubRegSet<1, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15],
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+
+def : SubRegSet<2, [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15],
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+
//===----------------------------------------------------------------------===//
// Register Class Definitions... now that we have all of the pieces, define the
// top-level register classes. The order specified in the register list is
@@ -793,6 +817,7 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11,
XMM12, XMM13, XMM14, XMM15]> {
+ let SubRegClassList = [FR32, FR64];
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
@@ -811,7 +836,9 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
def VR256 : RegisterClass<"X86", [ v8i32, v4i64, v8f32, v4f64],256,
[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11,
- YMM12, YMM13, YMM14, YMM15]>;
+ YMM12, YMM13, YMM14, YMM15]> {
+ let SubRegClassList = [FR32, FR64, VR128];
+}
// Status flags registers.
def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 5e05c2f..594a470 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -20,9 +20,9 @@
namespace llvm {
class GlobalValue;
class TargetMachine;
-
+
/// PICStyles - The X86 backend supports a number of different styles of PIC.
-///
+///
namespace PICStyles {
enum Style {
StubPIC, // Used on i386-darwin in -fPIC mode.
@@ -46,7 +46,7 @@ protected:
/// PICStyle - Which PIC style to use
///
PICStyles::Style PICStyle;
-
+
/// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
/// none supported.
X86SSEEnum X86SSELevel;
@@ -58,7 +58,7 @@ protected:
/// HasCMov - True if this processor has conditional move instructions
/// (generally pentium pro+).
bool HasCMov;
-
+
/// HasX86_64 - True if the processor supports X86-64 instructions.
///
bool HasX86_64;
@@ -78,8 +78,9 @@ protected:
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
- /// HasVectorUAMem - True if SIMD operations can have unaligned memory operands.
- /// This may require setting a feature bit in the processor.
+ /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+ /// operands. This may require setting a feature bit in the
+ /// processor.
bool HasVectorUAMem;
/// DarwinVers - Nonzero if this is a darwin platform: the numeric
@@ -150,20 +151,20 @@ public:
bool isTargetDarwin() const { return TargetType == isDarwin; }
bool isTargetELF() const { return TargetType == isELF; }
-
+
bool isTargetWindows() const { return TargetType == isWindows; }
bool isTargetMingw() const { return TargetType == isMingw; }
bool isTargetCygwin() const { return TargetType == isCygwin; }
bool isTargetCygMing() const {
return TargetType == isMingw || TargetType == isCygwin;
}
-
+
/// isTargetCOFF - Return true if this is any COFF/Windows target variant.
bool isTargetCOFF() const {
return TargetType == isMingw || TargetType == isCygwin ||
TargetType == isWindows;
}
-
+
bool isTargetWin64() const {
return Is64Bit && (TargetType == isMingw || TargetType == isWindows);
}
@@ -196,11 +197,11 @@ public:
bool isPICStyleStubAny() const {
return PICStyle == PICStyles::StubDynamicNoPIC ||
PICStyle == PICStyles::StubPIC; }
-
+
/// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
/// 10 = Snow Leopard, etc.
unsigned getDarwinVers() const { return DarwinVers; }
-
+
/// ClassifyGlobalReference - Classify a global variable reference for the
/// current subtarget according to how we should reference it in a non-pcrel
/// context.
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 7802f98..56ddaf8 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -51,6 +51,12 @@ extern "C" void LLVMInitializeX86Target() {
createX86_32MCCodeEmitter);
TargetRegistry::RegisterCodeEmitter(TheX86_64Target,
createX86_64MCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterAsmBackend(TheX86_32Target,
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterAsmBackend(TheX86_64Target,
+ createX86_64AsmBackend);
}
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index d1ee3fc..29a0be5 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -18,38 +18,6 @@
using namespace llvm;
using namespace dwarf;
-const MCExpr *X8632_MachoTargetObjectFile::
-getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding) const {
- // The mach-o version of this method defaults to returning a stub reference.
-
- if (Encoding & DW_EH_PE_indirect) {
- MachineModuleInfoMachO &MachOMMI =
- MMI->getObjFileInfo<MachineModuleInfoMachO>();
-
- SmallString<128> Name;
- Mang->getNameWithPrefix(Name, GV, true);
- Name += "$non_lazy_ptr";
-
- // Add information about the stub reference to MachOMMI so that the stub
- // gets emitted by the asmprinter.
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Name.str());
- MCSymbol *&StubSym = MachOMMI.getGVStubEntry(Sym);
- if (StubSym == 0) {
- Name.clear();
- Mang->getNameWithPrefix(Name, GV, false);
- StubSym = getContext().GetOrCreateSymbol(Name.str());
- }
-
- return TargetLoweringObjectFile::
- getSymbolForDwarfReference(Sym, MMI,
- Encoding & ~dwarf::DW_EH_PE_indirect);
- }
-
- return TargetLoweringObjectFileMachO::
- getSymbolForDwarfGlobalReference(GV, Mang, MMI, Encoding);
-}
-
const MCExpr *X8664_MachoTargetObjectFile::
getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI, unsigned Encoding) const {
@@ -148,35 +116,3 @@ unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const {
return DW_EH_PE_absptr;
}
-
-unsigned X8632_MachoTargetObjectFile::getPersonalityEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8632_MachoTargetObjectFile::getLSDAEncoding() const {
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8632_MachoTargetObjectFile::getFDEEncoding() const {
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8632_MachoTargetObjectFile::getTTypeEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8664_MachoTargetObjectFile::getPersonalityEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8664_MachoTargetObjectFile::getLSDAEncoding() const {
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8664_MachoTargetObjectFile::getFDEEncoding() const {
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned X8664_MachoTargetObjectFile::getTTypeEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 0fff194..0444417 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -17,20 +17,6 @@
namespace llvm {
class X86TargetMachine;
- /// X8632_MachoTargetObjectFile - This TLOF implementation is used for
- /// Darwin/x86-32.
- class X8632_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
- public:
-
- virtual const MCExpr *
- getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
- MachineModuleInfo *MMI, unsigned Encoding) const;
- virtual unsigned getPersonalityEncoding() const;
- virtual unsigned getLSDAEncoding() const;
- virtual unsigned getFDEEncoding() const;
- virtual unsigned getTTypeEncoding() const;
- };
-
/// X8664_MachoTargetObjectFile - This TLOF implementation is used for
/// Darwin/x86-64.
class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
@@ -39,17 +25,13 @@ namespace llvm {
virtual const MCExpr *
getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI, unsigned Encoding) const;
- virtual unsigned getPersonalityEncoding() const;
- virtual unsigned getLSDAEncoding() const;
- virtual unsigned getFDEEncoding() const;
- virtual unsigned getTTypeEncoding() const;
};
class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
const X86TargetMachine &TM;
public:
X8632_ELFTargetObjectFile(const X86TargetMachine &tm)
- :TM(tm) { };
+ :TM(tm) { }
virtual unsigned getPersonalityEncoding() const;
virtual unsigned getLSDAEncoding() const;
virtual unsigned getFDEEncoding() const;
@@ -60,7 +42,7 @@ namespace llvm {
const X86TargetMachine &TM;
public:
X8664_ELFTargetObjectFile(const X86TargetMachine &tm)
- :TM(tm) { };
+ :TM(tm) { }
virtual unsigned getPersonalityEncoding() const;
virtual unsigned getLSDAEncoding() const;
virtual unsigned getFDEEncoding() const;