Diffstat (limited to 'lib/Target/CellSPU')
30 files changed, 1367 insertions, 1117 deletions
diff --git a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
deleted file mode 100644
index 8a2b59a..0000000
--- a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-include_directories(
-  ${CMAKE_CURRENT_BINARY_DIR}/..
-  ${CMAKE_CURRENT_SOURCE_DIR}/..
-  )
-
-add_llvm_library(LLVMCellSPUAsmPrinter
-  SPUAsmPrinter.cpp
-  )
-add_dependencies(LLVMCellSPUAsmPrinter CellSPUCodeGenTable_gen)
diff --git a/lib/Target/CellSPU/AsmPrinter/Makefile b/lib/Target/CellSPU/AsmPrinter/Makefile
deleted file mode 100644
index 4ec9d04..0000000
--- a/lib/Target/CellSPU/AsmPrinter/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- lib/Target/CellSPU/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMCellSPUAsmPrinter
-
-# Hack: we need to include 'main' CellSPU target directory to grab
-# private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
index ddfca37..a2a2ef1 100644
--- a/lib/Target/CellSPU/CMakeLists.txt
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -12,16 +12,18 @@ tablegen(SPUGenSubtarget.inc -gen-subtarget)
 tablegen(SPUGenCallingConv.inc -gen-callingconv)
 
 add_llvm_target(CellSPUCodeGen
-  SPUFrameInfo.cpp
+  SPUAsmPrinter.cpp
   SPUHazardRecognizers.cpp
   SPUInstrInfo.cpp
   SPUISelDAGToDAG.cpp
   SPUISelLowering.cpp
+  SPUFrameLowering.cpp
   SPUMCAsmInfo.cpp
   SPURegisterInfo.cpp
   SPUSubtarget.cpp
   SPUTargetMachine.cpp
   SPUSelectionDAGInfo.cpp
+  SPUNopFiller.cpp
   )
 
-target_link_libraries (LLVMCellSPUCodeGen LLVMSelectionDAG)
+add_subdirectory(TargetInfo)
diff --git a/lib/Target/CellSPU/Makefile b/lib/Target/CellSPU/Makefile
index cbdbd3c..77c66be 100644
--- a/lib/Target/CellSPU/Makefile
+++ b/lib/Target/CellSPU/Makefile
@@ -16,6 +16,6 @@ BUILT_SOURCES = SPUGenInstrNames.inc SPUGenRegisterNames.inc \
 		SPUGenInstrInfo.inc SPUGenDAGISel.inc \
 		SPUGenSubtarget.inc SPUGenCallingConv.inc
 
-DIRS = AsmPrinter TargetInfo
+DIRS = TargetInfo
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt
index 0e7ad35..3e7e0b6 100644
--- a/lib/Target/CellSPU/README.txt
+++ b/lib/Target/CellSPU/README.txt
@@ -55,7 +55,7 @@ TODO:
 * i128 support:
   * zero extension, any extension: done
-  * sign extension: needed
+  * sign extension: done
   * arithmetic operators (add, sub, mul, div): needed
   * logical operations (and, or, shl, srl, sra, xor, nor, nand): needed
diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h
index 1f21511..72f8430 100644
--- a/lib/Target/CellSPU/SPU.h
+++ b/lib/Target/CellSPU/SPU.h
@@ -23,6 +23,7 @@ namespace llvm {
   class formatted_raw_ostream;
 
   FunctionPass *createSPUISelDag(SPUTargetMachine &TM);
+  FunctionPass *createSPUNopFillerPass(SPUTargetMachine &tm);
 
   extern Target TheCellSPUTarget;
 }
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
index 069a182..5ef5716 100644
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -54,8 +54,8 @@ class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
 // The i64 seteq fragment that does the scalar->vector conversion and
 // comparison:
 def CEQr64compare:
-    CodeFrag<(CGTIv4i32
(GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB))), 0xb)>; + CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))), 0xb)>; // The i64 seteq fragment that does the vector comparison def CEQv2i64compare: @@ -67,12 +67,14 @@ def CEQv2i64compare: // v2i64 seteq (equality): the setcc result is v4i32 multiclass CompareEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>; - def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQv2i64compare.Fragment, R32C))>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQv2i64compare.Fragment), R32C))>; } defm I64EQ: CompareEqual64; @@ -89,10 +91,12 @@ def : I64SELECTNegCond<setne, I64EQr64>; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def CLGTr64ugt: - CodeFrag<(CLGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CLGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CLGTr64eq: - CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CLGTr64compare: CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment, @@ -112,12 +116,14 @@ def CLGTv2i64compare: multiclass CompareLogicalGreaterThan64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CLGTr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGTr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CLGTv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTv2i64compare.Fragment), R32C))>; } defm I64LGT: CompareLogicalGreaterThan64; @@ -144,12 +150,14 @@ def CLGEv2i64compare: multiclass CompareLogicalGreaterEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CLGEr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGEr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CLGEv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEv2i64compare.Fragment),R32C))>; } defm I64LGE: CompareLogicalGreaterEqual64; @@ -168,10 +176,12 @@ def : I64SELECTNegCond<setult, I64LGEr64>; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def CGTr64sgt: - CodeFrag<(CGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CGTr64eq: - CodeFrag<(CEQv4i32 (ORv2i64_i64 
R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CGTr64compare: CodeFrag<(SELBv2i64 CGTr64sgt.Fragment, @@ -191,12 +201,14 @@ def CGTv2i64compare: multiclass CompareGreaterThan64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CGTr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGTr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CGTv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CGTv2i64compare.Fragment), R32C))>; } defm I64GT: CompareLogicalGreaterThan64; @@ -223,12 +235,12 @@ def CGEv2i64compare: multiclass CompareGreaterEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CGEr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGEr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CGEv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEr64compare.Fragment),R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEv2i64compare.Fragment),R32C))>; } defm I64GE: CompareGreaterEqual64; @@ -255,9 +267,9 @@ class v2i64_add<dag lhs, dag rhs, dag cg_mask>: v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>; def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS v2i64_add<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), @@ -275,11 +287,12 @@ class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>: CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>; def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - v2i64_sub_bg<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB)>.Fragment, - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS + v2i64_sub<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + v2i64_sub_bg<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG)>.Fragment, + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), @@ -374,9 +387,9 @@ class v2i64_mul<dag rA, dag rB, dag rCGmask>: rCGmask>; def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS v2i64_mul<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 3e95531..4040461 100644 --- 
a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -46,10 +46,6 @@ namespace { return "STI CBEA SPU Assembly Printer"; } - SPUTargetMachine &getTM() { - return static_cast<SPUTargetMachine&>(TM); - } - /// printInstruction - This method is automatically generated by tablegen /// from the instruction set description. void printInstruction(const MachineInstr *MI, raw_ostream &OS); @@ -64,15 +60,6 @@ namespace { } void printOp(const MachineOperand &MO, raw_ostream &OS); - /// printRegister - Print register according to target requirements. - /// - void printRegister(const MachineOperand &MO, bool R0AsZero, raw_ostream &O){ - unsigned RegNo = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && - "Not physreg??"); - O << getRegisterName(RegNo); - } - void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); if (MO.isReg()) { @@ -93,17 +80,6 @@ namespace { void - printS7ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) - { - int value = MI->getOperand(OpNo).getImm(); - value = (value << (32 - 7)) >> (32 - 7); - - assert((value >= -(1 << 8) && value <= (1 << 7) - 1) - && "Invalid s7 argument"); - O << value; - } - - void printU7ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { unsigned int value = MI->getOperand(OpNo).getImm(); @@ -134,12 +110,6 @@ namespace { } void - printU32ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) - { - O << (unsigned)MI->getOperand(OpNo).getImm(); - } - - void printMemRegReg(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { // When used as the base register, r0 reads constant zero rather than // the value contained in the register. For this reason, the darwin @@ -221,13 +191,6 @@ namespace { printOp(MI->getOperand(OpNo), O); } - void printHBROperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - // HBR operands are generated in front of branches, hence, the - // program counter plus the target. - O << ".+"; - printOp(MI->getOperand(OpNo), O); - } - void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).isImm()) { printS16ImmOperand(MI, OpNo, O); diff --git a/lib/Target/CellSPU/SPUFrameInfo.cpp b/lib/Target/CellSPU/SPUFrameInfo.cpp deleted file mode 100644 index 60d7ba7..0000000 --- a/lib/Target/CellSPU/SPUFrameInfo.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Top-level implementation for the Cell SPU target. 
-// -//===----------------------------------------------------------------------===// - -#include "SPU.h" -#include "SPUFrameInfo.h" -#include "SPURegisterNames.h" -using namespace llvm; - -//===----------------------------------------------------------------------===// -// SPUFrameInfo: -//===----------------------------------------------------------------------===// - -SPUFrameInfo::SPUFrameInfo(const TargetMachine &tm): - TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), - TM(tm) -{ - LR[0].first = SPU::R0; - LR[0].second = 16; -} diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp new file mode 100644 index 0000000..432f4a1 --- /dev/null +++ b/lib/Target/CellSPU/SPUFrameLowering.cpp @@ -0,0 +1,276 @@ +//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the Cell SPU target. +// +//===----------------------------------------------------------------------===// + +#include "SPU.h" +#include "SPUFrameLowering.h" +#include "SPURegisterNames.h" +#include "SPUInstrBuilder.h" +#include "SPUInstrInfo.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// SPUFrameLowering: +//===----------------------------------------------------------------------===// + +SPUFrameLowering::SPUFrameLowering(const SPUSubtarget &sti) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0), + Subtarget(sti) { + LR[0].first = SPU::R0; + LR[0].second = 16; +} + + +//-------------------------------------------------------------------------- +// hasFP - Return true if the specified function actually has a dedicated frame +// pointer register. This is true if the function needs a frame pointer and has +// a non-zero stack size. +bool SPUFrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + return MFI->getStackSize() && + (DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()); +} + + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +void SPUFrameLowering::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + unsigned FrameSize = MFI->getStackSize(); + + // Get the alignments provided by the target, and the maximum alignment + // (if any) of the fixed frame objects. + unsigned TargetAlign = getStackAlignment(); + unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment()); + assert(isPowerOf2_32(Align) && "Alignment is not power of 2"); + unsigned AlignMask = Align - 1; + + // Get the maximum call frame size of all the calls. 
+ unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; + + // Update maximum call frame size. + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = (FrameSize + AlignMask) & ~AlignMask; + + // Update frame info. + MFI->setStackSize(FrameSize); +} + +void SPUFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + MachineModuleInfo &MMI = MF.getMMI(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Prepare for debug frame info. + bool hasDebugInfo = MMI.hasDebugInfo(); + MCSymbol *FrameLabel = 0; + + // Move MBBI back to the beginning of the function. + MBBI = MBB.begin(); + + // Work out frame sizes. + determineFrameLayout(MF); + int FrameSize = MFI->getStackSize(); + + assert((FrameSize & 0xf) == 0 + && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = -(FrameSize + SPUFrameLowering::minStackSize()); + if (hasDebugInfo) { + // Mark effective beginning of when frame pointer becomes valid. + FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); + } + + // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) + // for the ABI + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) + .addReg(SPU::R1); + if (isInt<10>(FrameSize)) { + // Spill $sp to adjusted $sp + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) + .addReg(SPU::R1); + // Adjust $sp by required amout + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) + .addImm(FrameSize); + } else if (isInt<16>(FrameSize)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(-16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) + .addReg(SPU::R2) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2) + .addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + + if (hasDebugInfo) { + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + + // Show update of SP. + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + + // Add callee saved registers to move list. 
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned Reg = CSI[I].getReg(); + if (Reg == SPU::R0) continue; + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc)); + } + + // Mark effective beginning of when frame pointer is ready. + MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); + + MachineLocation FPDst(SPU::R1); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc)); + } + } else { + // This is a leaf function -- insert a branch hint iff there are + // sufficient number instructions in the basic block. Note that + // this is just a best guess based on the basic block's size. + if (MBB.size() >= (unsigned) SPUFrameLowering::branchHintPenalty()) { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + dl = MBBI->getDebugLoc(); + + // Insert terminator label + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)) + .addSym(MMI.getContext().CreateTempSymbol()); + } + } +} + +void SPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int FrameSize = MFI->getStackSize(); + int LinkSlotOffset = SPUFrameLowering::stackSlotSize(); + DebugLoc dl = MBBI->getDebugLoc(); + + assert(MBBI->getOpcode() == SPU::RET && + "Can only insert epilog into returning blocks"); + assert((FrameSize & 0xf) == 0 && "FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = FrameSize + SPUFrameLowering::minStackSize(); + if (isInt<10>(FrameSize + LinkSlotOffset)) { + // Reload $lr, adjust $sp by required amount + // Note: We do this to slightly improve dual issue -- not by much, but it + // is an opportunity for dual issue. + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(FrameSize + LinkSlotOffset) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1) + .addReg(SPU::R1) + .addImm(FrameSize); + } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). + addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + } +} + +void SPUFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) + const { + // Initial state of the frame pointer is R1. 
+ MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(SPU::R1, 0); + Moves.push_back(MachineMove(0, Dst, Src)); +} + +void SPUFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const{ + // Mark LR and SP unused, since the prolog spills them to stack and + // we don't want anyone else to spill them for us. + // + // Also, unless R2 is really used someday, don't spill it automatically. + MF.getRegInfo().setPhysRegUnused(SPU::R0); + MF.getRegInfo().setPhysRegUnused(SPU::R1); + MF.getRegInfo().setPhysRegUnused(SPU::R2); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterClass *RC = &SPU::R32CRegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); +} diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameLowering.h index f511acd..4fee72d 100644 --- a/lib/Target/CellSPU/SPUFrameInfo.h +++ b/lib/Target/CellSPU/SPUFrameLowering.h @@ -1,4 +1,4 @@ -//===-- SPUFrameInfo.h - Top-level interface for Cell SPU Target -*- C++ -*-==// +//=====-- SPUFrameLowering.h - SPU Frame Lowering stuff -*- C++ -*----========// // // The LLVM Compiler Infrastructure // @@ -12,19 +12,39 @@ // //===----------------------------------------------------------------------===// -#if !defined(SPUFRAMEINFO_H) +#ifndef SPU_FRAMEINFO_H +#define SPU_FRAMEINFO_H -#include "llvm/Target/TargetFrameInfo.h" -#include "llvm/Target/TargetMachine.h" #include "SPURegisterInfo.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { - class SPUFrameInfo: public TargetFrameInfo { - const TargetMachine &TM; + class SPUSubtarget; + + class SPUFrameLowering: public TargetFrameLowering { + const SPUSubtarget &Subtarget; std::pair<unsigned, int> LR[1]; public: - SPUFrameInfo(const TargetMachine &tm); + SPUFrameLowering(const SPUSubtarget &sti); + + //! Determine the frame's layour + void determineFrameLayout(MachineFunction &MF) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + //! Prediate: Target has dedicated frame pointer + bool hasFP(const MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + //! Perform target-specific stack frame setup. + void getInitialFrameState(std::vector<MachineMove> &Moves) const; //! Return a function's saved spill slots /*! @@ -71,5 +91,4 @@ namespace llvm { }; } -#define SPUFRAMEINFO_H 1 #endif diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp index 9dbab1d..403d7ef 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.cpp +++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp @@ -41,12 +41,14 @@ SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) : /// /// \return NoHazard ScheduleHazardRecognizer::HazardType -SPUHazardRecognizer::getHazardType(SUnit *SU) +SPUHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { // Initial thoughts on how to do this, but this code cannot work unless the // function's prolog and epilog code are also being scheduled so that we can // accurately determine which pipeline is being scheduled. 
#if 0 + assert(Stalls == 0 && "SPU hazards don't yet support scoreboard lookahead"); + const SDNode *Node = SU->getNode()->getFlaggedMachineNode(); ScheduleHazardRecognizer::HazardType retval = NoHazard; bool mustBeOdd = false; diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h index d0ae2d8..675632c 100644 --- a/lib/Target/CellSPU/SPUHazardRecognizers.h +++ b/lib/Target/CellSPU/SPUHazardRecognizers.h @@ -20,7 +20,7 @@ namespace llvm { class TargetInstrInfo; - + /// SPUHazardRecognizer class SPUHazardRecognizer : public ScheduleHazardRecognizer { @@ -30,7 +30,7 @@ private: public: SPUHazardRecognizer(const TargetInstrInfo &TII); - virtual HazardType getHazardType(SUnit *SU); + virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void EmitInstruction(SUnit *SU); virtual void AdvanceCycle(); virtual void EmitNoop(); diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 2f15984..d226156 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -15,7 +15,7 @@ #include "SPU.h" #include "SPUTargetMachine.h" #include "SPUHazardRecognizers.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" #include "SPURegisterNames.h" #include "SPUTargetMachine.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -111,55 +111,6 @@ namespace { return false; } - //===------------------------------------------------------------------===// - //! EVT to "useful stuff" mapping structure: - - struct valtype_map_s { - EVT VT; - unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined) - bool ldresult_imm; /// LDRESULT instruction requires immediate? - unsigned lrinst; /// LR instruction - }; - - const valtype_map_s valtype_map[] = { - { MVT::i8, SPU::ORBIr8, true, SPU::LRr8 }, - { MVT::i16, SPU::ORHIr16, true, SPU::LRr16 }, - { MVT::i32, SPU::ORIr32, true, SPU::LRr32 }, - { MVT::i64, SPU::ORr64, false, SPU::LRr64 }, - { MVT::f32, SPU::ORf32, false, SPU::LRf32 }, - { MVT::f64, SPU::ORf64, false, SPU::LRf64 }, - // vector types... (sigh!) - { MVT::v16i8, 0, false, SPU::LRv16i8 }, - { MVT::v8i16, 0, false, SPU::LRv8i16 }, - { MVT::v4i32, 0, false, SPU::LRv4i32 }, - { MVT::v2i64, 0, false, SPU::LRv2i64 }, - { MVT::v4f32, 0, false, SPU::LRv4f32 }, - { MVT::v2f64, 0, false, SPU::LRv2f64 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) - { - const valtype_map_s *retval = 0; - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].VT == VT) { - retval = valtype_map + i; - break; - } - } - - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns" - "NULL for " + Twine(VT.getEVTString())); - } -#endif - - return retval; - } - //! Generate the carry-generate shuffle mask. SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) { SmallVector<SDValue, 16 > ShufBytes; @@ -221,16 +172,10 @@ namespace { return CurDAG->getTargetConstant(Imm, MVT::i32); } - /// getI64Imm - Return a target constant with the specified value, of type - /// i64. - inline SDValue getI64Imm(uint64_t Imm) { - return CurDAG->getTargetConstant(Imm, MVT::i64); - } - /// getSmallIPtrImm - Return a target constant of pointer type. 
inline SDValue getSmallIPtrImm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); - } + } SDNode *emitBuildVector(SDNode *bvNode) { EVT vecVT = bvNode->getValueType(0); @@ -268,10 +213,10 @@ namespace { unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue CGPoolOffset = SPU::LowerConstantPool(CPIdx, *CurDAG, TM); - + HandleSDNode Dummy(CurDAG->getLoad(vecVT, dl, CurDAG->getEntryNode(), CGPoolOffset, - PseudoSourceValue::getConstantPool(),0, + MachinePointerInfo::getConstantPool(), false, false, Alignment)); CurDAG->ReplaceAllUsesWith(SDValue(bvNode, 0), Dummy.getValue()); if (SDNode *N = SelectCode(Dummy.getValue().getNode())) @@ -356,13 +301,8 @@ namespace { return "Cell SPU DAG->DAG Pattern Instruction Selection"; } - /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for - /// this target when scheduling the DAG. - virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() { - const TargetInstrInfo *II = TM.getInstrInfo(); - assert(II && "No InstrInfo?"); - return new SPUHazardRecognizer(*II); - } + private: + SDValue getRC( MVT ); // Include the pieces autogenerated from the target description. #include "SPUGenDAGISel.inc" @@ -450,8 +390,8 @@ bool SPUDAGToDAGISel::SelectDFormAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Index) { return DFormAddressPredicate(Op, N, Base, Index, - SPUFrameInfo::minFrameOffset(), - SPUFrameInfo::maxFrameOffset()); + SPUFrameLowering::minFrameOffset(), + SPUFrameLowering::maxFrameOffset()); } bool @@ -467,7 +407,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, int FI = int(FIN->getIndex()); DEBUG(errs() << "SelectDFormAddr: ISD::FrameIndex = " << FI << "\n"); - if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) { + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { Base = CurDAG->getTargetConstant(0, PtrTy); Index = CurDAG->getTargetFrameIndex(FI, PtrTy); return true; @@ -493,7 +433,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset << " frame index = " << FI << "\n"); - if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) { + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { Base = CurDAG->getTargetConstant(offset, PtrTy); Index = CurDAG->getTargetFrameIndex(FI, PtrTy); return true; @@ -514,7 +454,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset << " frame index = " << FI << "\n"); - if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) { + if (SPUFrameLowering::FItoStackOffset(FI) < maxOffset) { Base = CurDAG->getTargetConstant(offset, PtrTy); Index = CurDAG->getTargetFrameIndex(FI, PtrTy); return true; @@ -564,8 +504,8 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base, Base = CurDAG->getTargetConstant(0, N.getValueType()); Index = N; return true; - } else if (Opc == ISD::Register - ||Opc == ISD::CopyFromReg + } else if (Opc == ISD::Register + ||Opc == ISD::CopyFromReg ||Opc == ISD::UNDEF ||Opc == ISD::Constant) { unsigned OpOpc = Op->getOpcode(); @@ -625,6 +565,46 @@ SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base, return false; } +/*! + Utility function to use with COPY_TO_REGCLASS instructions. 
Returns a SDValue + to be used as the last parameter of a +CurDAG->getMachineNode(COPY_TO_REGCLASS,..., ) function call + \arg VT the value type for which we want a register class +*/ +SDValue SPUDAGToDAGISel::getRC( MVT VT ) { + switch( VT.SimpleTy ) { + case MVT::i8: + return CurDAG->getTargetConstant(SPU::R8CRegClass.getID(), MVT::i32); + break; + case MVT::i16: + return CurDAG->getTargetConstant(SPU::R16CRegClass.getID(), MVT::i32); + break; + case MVT::i32: + return CurDAG->getTargetConstant(SPU::R32CRegClass.getID(), MVT::i32); + break; + case MVT::f32: + return CurDAG->getTargetConstant(SPU::R32FPRegClass.getID(), MVT::i32); + break; + case MVT::i64: + return CurDAG->getTargetConstant(SPU::R64CRegClass.getID(), MVT::i32); + break; + case MVT::i128: + return CurDAG->getTargetConstant(SPU::GPRCRegClass.getID(), MVT::i32); + break; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v4f32: + case MVT::v2i64: + case MVT::v2f64: + return CurDAG->getTargetConstant(SPU::VECREGRegClass.getID(), MVT::i32); + break; + default: + assert( false && "add a new case here" ); + } + return SDValue(); +} + //! Convert the operand from a target-independent to a target-specific node /*! */ @@ -632,7 +612,7 @@ SDNode * SPUDAGToDAGISel::Select(SDNode *N) { unsigned Opc = N->getOpcode(); int n_ops = -1; - unsigned NewOpc; + unsigned NewOpc = 0; EVT OpVT = N->getValueType(0); SDValue Ops[8]; DebugLoc dl = N->getDebugLoc(); @@ -654,7 +634,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { NewOpc = SPU::Ar32; Ops[0] = CurDAG->getRegister(SPU::R1, N->getValueType(0)); Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILAr32, dl, - N->getValueType(0), TFI, Imm0), + N->getValueType(0), TFI), 0); n_ops = 2; } @@ -669,7 +649,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { EVT Op0VT = Op0.getValueType(); EVT Op0VecVT = EVT::getVectorVT(*CurDAG->getContext(), Op0VT, (128 / Op0VT.getSizeInBits())); - EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), + EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, (128 / OpVT.getSizeInBits())); SDValue shufMask; @@ -703,19 +683,19 @@ SPUDAGToDAGISel::Select(SDNode *N) { } SDNode *shufMaskLoad = emitBuildVector(shufMask.getNode()); - + HandleSDNode PromoteScalar(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl, Op0VecVT, Op0)); - + SDValue PromScalar; if (SDNode *N = SelectCode(PromoteScalar.getValue().getNode())) PromScalar = SDValue(N, 0); else PromScalar = PromoteScalar.getValue(); - + SDValue zextShuffle = CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT, - PromScalar, PromScalar, + PromScalar, PromScalar, SDValue(shufMaskLoad, 0)); HandleSDNode Dummy2(zextShuffle); @@ -725,7 +705,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { zextShuffle = Dummy2.getValue(); HandleSDNode Dummy(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT, zextShuffle)); - + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); SelectCode(Dummy.getValue().getNode()); return Dummy.getValue().getNode(); @@ -736,7 +716,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { HandleSDNode Dummy(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT, N->getOperand(0), N->getOperand(1), SDValue(CGLoad, 0))); - + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); if (SDNode *N = SelectCode(Dummy.getValue().getNode())) return N; @@ -748,7 +728,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { HandleSDNode Dummy(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT, N->getOperand(0), N->getOperand(1), SDValue(CGLoad, 0))); - + CurDAG->ReplaceAllUsesWith(N, Dummy.getValue().getNode()); if (SDNode *N = SelectCode(Dummy.getValue().getNode())) return N; 
@@ -779,8 +759,8 @@ SPUDAGToDAGISel::Select(SDNode *N) { if (shift_amt >= 32) { SDNode *hi32 = - CurDAG->getMachineNode(SPU::ORr32_r64, dl, OpVT, - Op0.getOperand(0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + Op0.getOperand(0), getRC(MVT::i32)); shift_amt -= 32; if (shift_amt > 0) { @@ -862,23 +842,12 @@ SPUDAGToDAGISel::Select(SDNode *N) { SDValue Arg = N->getOperand(0); SDValue Chain = N->getOperand(1); SDNode *Result; - const valtype_map_s *vtm = getValueTypeMapEntry(VT); - - if (vtm->ldresult_ins == 0) { - report_fatal_error("LDRESULT for unsupported type: " + - Twine(VT.getEVTString())); - } - - Opc = vtm->ldresult_ins; - if (vtm->ldresult_imm) { - SDValue Zero = CurDAG->getTargetConstant(0, VT); - - Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain); - } else { - Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain); - } + Result = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VT, + MVT::Other, Arg, + getRC( VT.getSimpleVT()), Chain); return Result; + } else if (Opc == SPUISD::IndirectAddr) { // Look at the operands: SelectCode() will catch the cases that aren't // specifically handled here. @@ -904,10 +873,10 @@ SPUDAGToDAGISel::Select(SDNode *N) { NewOpc = SPU::AIr32; Ops[1] = Op1; } else { - Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILr32, dl, - N->getValueType(0), + Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILr32, dl, + N->getValueType(0), Op1), - 0); + 0); } } Ops[0] = Op0; @@ -939,7 +908,7 @@ SPUDAGToDAGISel::Select(SDNode *N) { SDNode * SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { SDValue Op0 = N->getOperand(0); - EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, (128 / OpVT.getSizeInBits())); SDValue ShiftAmt = N->getOperand(1); EVT ShiftAmtVT = ShiftAmt.getValueType(); @@ -947,7 +916,8 @@ SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { SDValue SelMaskVal; DebugLoc dl = N->getDebugLoc(); - VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0); + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16); SelMask = CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal); ZeroFill = CurDAG->getMachineNode(SPU::ILv2i64, dl, VecVT, @@ -991,7 +961,8 @@ SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(Bits, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! @@ -1012,7 +983,8 @@ SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { SDNode *VecOp0, *Shift = 0; DebugLoc dl = N->getDebugLoc(); - VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0); + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) { unsigned bytes = unsigned(CN->getZExtValue()) >> 3; @@ -1058,7 +1030,8 @@ SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(Bits, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! 
@@ -1072,21 +1045,23 @@ SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { SDNode * SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) { // Promote Op0 to vector - EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), + EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, (128 / OpVT.getSizeInBits())); SDValue ShiftAmt = N->getOperand(1); EVT ShiftAmtVT = ShiftAmt.getValueType(); DebugLoc dl = N->getDebugLoc(); SDNode *VecOp0 = - CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, N->getOperand(0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + VecVT, N->getOperand(0), getRC(MVT::v2i64)); SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT); SDNode *SignRot = CurDAG->getMachineNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64, SDValue(VecOp0, 0), SignRotAmt); SDNode *UpperHalfSign = - CurDAG->getMachineNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + MVT::i32, SDValue(SignRot, 0), getRC(MVT::i32)); SDNode *UpperHalfSignMask = CurDAG->getMachineNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0)); @@ -1133,7 +1108,8 @@ SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(NegShift, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! @@ -1154,20 +1130,21 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, // Here's where it gets interesting, because we have to parse out the // subtree handed back in i64vec: - if (i64vec.getOpcode() == ISD::BIT_CONVERT) { + if (i64vec.getOpcode() == ISD::BITCAST) { // The degenerate case where the upper and lower bits in the splat are // identical: SDValue Op0 = i64vec.getOperand(0); ReplaceUses(i64vec, Op0); - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, - SDValue(emitBuildVector(Op0.getNode()), 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(Op0.getNode()), 0), + getRC(MVT::i64)); } else if (i64vec.getOpcode() == SPUISD::SHUFB) { SDValue lhs = i64vec.getOperand(0); SDValue rhs = i64vec.getOperand(1); SDValue shufmask = i64vec.getOperand(2); - if (lhs.getOpcode() == ISD::BIT_CONVERT) { + if (lhs.getOpcode() == ISD::BITCAST) { ReplaceUses(lhs, lhs.getOperand(0)); lhs = lhs.getOperand(0); } @@ -1176,7 +1153,7 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, ? lhs.getNode() : emitBuildVector(lhs.getNode())); - if (rhs.getOpcode() == ISD::BIT_CONVERT) { + if (rhs.getOpcode() == ISD::BITCAST) { ReplaceUses(rhs, rhs.getOperand(0)); rhs = rhs.getOperand(0); } @@ -1185,7 +1162,7 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, ? 
rhs.getNode() : emitBuildVector(rhs.getNode())); - if (shufmask.getOpcode() == ISD::BIT_CONVERT) { + if (shufmask.getOpcode() == ISD::BITCAST) { ReplaceUses(shufmask, shufmask.getOperand(0)); shufmask = shufmask.getOperand(0); } @@ -1201,11 +1178,13 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, HandleSDNode Dummy(shufNode); SDNode *SN = SelectCode(Dummy.getValue().getNode()); if (SN == 0) SN = Dummy.getValue().getNode(); - - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(SN, 0)); + + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(SN, 0), getRC(MVT::i64)); } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) { - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, - SDValue(emitBuildVector(i64vec.getNode()), 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(i64vec.getNode()), 0), + getRC(MVT::i64)); } else { report_fatal_error("SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec" "condition"); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 46f3189..e6511d0 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1,4 +1,3 @@ -// //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // The LLVM Compiler Infrastructure // @@ -14,12 +13,13 @@ #include "SPURegisterNames.h" #include "SPUISelLowering.h" #include "SPUTargetMachine.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" #include "SPUMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -41,41 +41,12 @@ using namespace llvm; namespace { std::map<unsigned, const char *> node_names; - //! EVT mapping to useful data for Cell SPU - struct valtype_map_s { - EVT valtype; - int prefslot_byte; - }; - - const valtype_map_s valtype_map[] = { - { MVT::i1, 3 }, - { MVT::i8, 3 }, - { MVT::i16, 2 }, - { MVT::i32, 0 }, - { MVT::f32, 0 }, - { MVT::i64, 0 }, - { MVT::f64, 0 }, - { MVT::i128, 0 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) { - const valtype_map_s *retval = 0; - - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].valtype == VT) { - retval = valtype_map + i; - break; - } - } - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("getValueTypeMapEntry returns NULL for " + - Twine(VT.getEVTString())); - } -#endif + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; return retval; } @@ -125,8 +96,6 @@ namespace { SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()), SPUTM(TM) { - // Fold away setcc operations if possible. - setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. 
setUseUnderscoreSetJmp(true); @@ -376,10 +345,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -439,9 +408,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); // These operations need to be expanded: setOperationAction(ISD::SDIV, VT, Expand); @@ -502,8 +471,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; + node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; + node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; @@ -531,10 +500,20 @@ unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { //===----------------------------------------------------------------------===// MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const { - // i16 and i32 are valid SETCC result types - return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? 
- VT.getSimpleVT().SimpleTy : - MVT::i32); + // i8, i16 and i32 are valid SETCC result types + MVT::SimpleValueType retval; + + switch(VT.getSimpleVT().SimpleTy){ + case MVT::i1: + case MVT::i8: + retval = MVT::i8; break; + case MVT::i16: + retval = MVT::i16; break; + case MVT::i32: + default: + retval = MVT::i32; + } + return retval; } //===----------------------------------------------------------------------===// @@ -572,113 +551,174 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + int pso = prefslotOffset(InVT); DebugLoc dl = Op.getDebugLoc(); - - switch (LN->getAddressingMode()) { - case ISD::UNINDEXED: { - SDValue result; - SDValue basePtr = LN->getBasePtr(); - SDValue rotate; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and the rotation amount: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); - - if (rotamt < 0) - rotamt += 16; - - rotate = DAG.getConstant(rotamt, MVT::i16); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) - || (basePtr.getOpcode() == SPUISD::IndirectAddr - && basePtr.getOperand(0).getOpcode() == SPUISD::Hi - && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { - // Plain aligned a-form address: rotate into preferred slot - // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getConstant(rotamt, MVT::i16); - } else { - // Offset the rotate amount by the basePtr and the preferred slot - // byte offset - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getNode(ISD::ADD, dl, PtrVT, - basePtr, - DAG.getConstant(rotamt, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa<ConstantSDNode>(Op1)) { - // Convert the (add <ptr>, <const>) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add <arg1>, <arg2>) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + EVT vecVT = InVT.isVector()? 
InVT: EVT::getVectorVT(*DAG.getContext(), InVT, + (128 / InVT.getSizeInBits())); + + // two sanity checks + assert( LN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (InVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = LN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16); + + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - pso); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); + } else { // Offset the rotate amount by the basePtr and the preferred slot // byte offset + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; rotate = DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, - DAG.getConstant(-vtm->prefslot_byte, PtrVT)); + DAG.getConstant(rotamt, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. 
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-pso, PtrVT)); + } - // Re-emit as a v16i8 vector load - result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), - LN->isVolatile(), LN->isNonTemporal(), 16); + // Do the load as a i128 to allow possible shifting + SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + // When the size is not greater than alignment we get all data with just + // one load + if (alignment >= InVT.getSizeInBits()/8) { // Update the chain - the_chain = result.getValue(1); + the_chain = low.getValue(1); // Rotate into the preferred slot: - result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8, - result.getValue(0), rotate); + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128, + low.getValue(0), rotate); // Convert the loaded v16i8 vector to the appropriate vector type // specified by the operand: - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), InVT, (128 / InVT.getSizeInBits())); result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, - DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result)); + DAG.getNode(ISD::BITCAST, dl, vecVT, result)); + } + // When alignment is less than the size, we might need (known only at + // run-time) two loads + // TODO: if the memory address is composed only from constants, we have + // extra kowledge, and might avoid the second load + else { + // storage position offset from lower 16 byte aligned memory chunk + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + basePtr, DAG.getConstant( 0xf, MVT::i32 ) ); + // get a registerfull of ones. (this implementation is a workaround: LLVM + // cannot handle 128 bit signed int constants) + SDValue ones = DAG.getConstant(-1, MVT::v4i32 ); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + SDValue high = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(16, PtrVT)), + highMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + high.getValue(1)); + + // Shift the (possible) high part right to compensate the misalignemnt. + // if there is no highpart (i.e. value is i64 and offset is 4), this + // will zero out the high value. 
+ high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high, + DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset + )); + + // Shift the low similarily + // TODO: add SPUISD::SHL_BYTES + low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); + + // Merge the two parts + result = DAG.getNode(ISD::BITCAST, dl, vecVT, + DAG.getNode(ISD::OR, dl, MVT::i128, low, high)); + + if (!InVT.isVector()) { + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result ); + } + } // Handle extending loads by extending the scalar result: if (ExtType == ISD::SEXTLOAD) { result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); @@ -702,21 +742,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, retops, sizeof(retops) / sizeof(retops[0])); return result; - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)LN->getAddressingMode())); - /*NOTREACHED*/ - } - } - - return SDValue(); } /// Custom lower stores for CellSPU @@ -734,93 +759,103 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned alignment = SN->getAlignment(); + SDValue result; + EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT, + (128 / StVT.getSizeInBits())); + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = SN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16); + + + // two sanity checks + assert( SN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (StVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); - switch (SN->getAddressingMode()) { - case ISD::UNINDEXED: { - // The vector type we really want to load from the 16-byte chunk. 
- EVT vecVT = EVT::getVectorVT(*DAG.getContext(), - VT, (128 / VT.getSizeInBits())); - - SDValue alignLoadVec; - SDValue basePtr = SN->getBasePtr(); - SDValue the_chain = SN->getChain(); - SDValue insertEltOffs; - - if (alignment == 16) { - ConstantSDNode *CN; - // Special cases for a known aligned load to simplify the base pointer - // and insertion byte: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & 0xf), PtrVT)); - - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else { - // Otherwise, assume it's at byte 0 of basePtr - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa<ConstantSDNode>(Op1)) { - // Convert the (add <ptr>, <const>) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add <arg1>, <arg2>) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - - // Insertion point is solely determined by basePtr's contents - insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, DAG.getConstant(0, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. 
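When the store does not cross a 16-byte boundary, the path below loads the containing quadword, splices the scalar into it at the byte offset encoded by the generated shuffle mask (SHUFFLE_MASK feeding SHUFB), and writes the whole quadword back. A host-side model of that splice, assuming big-endian byte order within the quadword; the helper name is illustrative:

// Model of the aligned scalar-store path: load the containing quadword,
// insert the value's bytes at the target offset, store the quadword back.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

using Quad = std::array<uint8_t, 16>;

// Splice 'size' bytes of 'value' (most significant byte first) into 'chunk'
// starting at byte 'offset'.
static Quad insertBytes(Quad chunk, uint64_t value, unsigned size,
                        unsigned offset) {
  for (unsigned i = 0; i < size; ++i)
    chunk[offset + i] = static_cast<uint8_t>(value >> (8 * (size - 1 - i)));
  return chunk;
}

int main() {
  Quad chunk;
  chunk.fill(0xee);                        // previously stored data
  // Store the i32 0x01020304 at byte offset 8 of the quadword.
  Quad out = insertBytes(chunk, 0x01020304u, 4, 8);
  assert(out[7] == 0xee && out[8] == 0x01 && out[11] == 0x04 && out[12] == 0xee);
  std::printf("store-insert sketch ok\n");
  return 0;
}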
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Load the memory to which to store. - alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr, - SN->getSrcValue(), SN->getSrcValueOffset(), - SN->isVolatile(), SN->isNonTemporal(), 16); + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + // Load the lower part of the memory to which to store. + SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr, + lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16); + + // if we don't need to store over the 16 byte boundary, one store suffices + if (alignment >= StVT.getSizeInBits()/8) { // Update the chain - the_chain = alignLoadVec.getValue(1); + the_chain = low.getValue(1); - LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec); + LoadSDNode *LN = cast<LoadSDNode>(low); SDValue theValue = SN->getValue(); - SDValue result; if (StVT != VT && (theValue.getOpcode() == ISD::AssertZext @@ -844,48 +879,114 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs); - SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, + SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue); result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, - vectorizeOp, alignLoadVec, - DAG.getNode(ISD::BIT_CONVERT, dl, + vectorizeOp, low, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, insertEltOp)); result = DAG.getStore(the_chain, dl, result, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), + lowMemPtr, LN->isVolatile(), LN->isNonTemporal(), - LN->getAlignment()); - -#if 0 && !defined(NDEBUG) - if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { - const SDValue ¤tRoot = DAG.getRoot(); - - DAG.setRoot(result); - errs() << "------- CellSPU:LowerStore result:\n"; - DAG.dump(); - errs() << "-------\n"; - DAG.setRoot(currentRoot); - } -#endif - - return result; - /*UNREACHED*/ - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)SN->getAddressingMode())); - /*NOTREACHED*/ - } + 16); + + } + // do the store when it might cross the 16 byte memory access boundary. + else { + // TODO issue a warning if SN->isVolatile()== true? This is likely not + // what the user wanted. 
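For a store that may cross the 16-byte boundary, the block below builds byte masks for the affected parts of the low and high quadwords, clears those bytes in the previously loaded data, shifts the value into place, ORs, and writes both quadwords back. A self-contained model of that read-modify-write, assuming the value is left-justified at byte 0 of its quadword (the word, doubleword and full-vector cases); helper names are illustrative:

// Model of the store that may cross a quadword boundary: mask, clear,
// shift the value into place, OR, write back two quadwords.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

using Quad = std::array<uint8_t, 16>;

static Quad shlBytes(const Quad &q, unsigned n) {   // SHL_BYTES
  Quad r{};
  for (unsigned i = 0; i + n < 16; ++i) r[i] = q[i + n];
  return r;
}
static Quad srlBytes(const Quad &q, unsigned n) {   // SRL_BYTES (zero fill)
  Quad r{};
  for (unsigned i = n; i < 16; ++i) r[i] = q[i - n];
  return r;
}

int main() {
  uint8_t mem[32];
  for (int i = 0; i < 32; ++i) mem[i] = 0xaa;       // existing memory contents

  // Store an 8-byte value (bytes 1..8) at offset 12: 4 bytes land in the low
  // quadword, 4 in the high one.
  const unsigned size = 8, offset = 12;
  Quad value{};                                      // value in bytes [0, size)
  for (unsigned i = 0; i < size; ++i) value[i] = static_cast<uint8_t>(i + 1);

  Quad ones; ones.fill(0xff);
  Quad lowmask = shlBytes(srlBytes(ones, 16 - size), 16 - size); // bytes [0,size)
  Quad himask  = shlBytes(lowmask, 16 - offset);     // high-quad bytes touched
  lowmask      = srlBytes(lowmask, offset);          // low-quad bytes touched

  Quad low{}, high{};
  for (int i = 0; i < 16; ++i) { low[i] = mem[i]; high[i] = mem[16 + i]; }

  Quad rlow = srlBytes(value, offset);               // value part for low quad
  Quad rhi  = shlBytes(value, 16 - offset);          // value part for high quad
  for (int i = 0; i < 16; ++i) {
    mem[i]      = (low[i]  & ~lowmask[i]) | (rlow[i] & lowmask[i]);
    mem[16 + i] = (high[i] & ~himask[i])  |  rhi[i];
  }

  for (unsigned i = 0; i < size; ++i) assert(mem[offset + i] == i + 1);
  assert(mem[11] == 0xaa && mem[20] == 0xaa);        // neighbours untouched
  std::printf("unaligned-store RMW sketch ok\n");
  return 0;
}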
+ + // address offset from nearest lower 16byte alinged address + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + SN->getBasePtr(), + DAG.getConstant(0xf, MVT::i32)); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset); + // 16 - sizeof(Value) + SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32)); + // get a registerfull of ones + SDValue ones = DAG.getConstant(-1, MVT::v4i32); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + // Create the 128 bit masks that have ones where the data to store is + // located. + SDValue lowmask, himask; + // if the value to store don't fill up the an entire 128 bits, zero + // out the last bits of the mask so that only the value we want to store + // is masked. + // this is e.g. in the case of store i32, align 2 + if (!VT.isVector()){ + Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus); + lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + surplus); + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask); + + } + else { + lowmask = ones; + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + } + // this will zero, if there are no data that goes to the high quad + himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + offset_compl); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, + offset); + + // Load in the old data and zero out the parts that will be overwritten with + // the new data to store. + SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); } - return SDValue(); + return result; } //! 
Generate the address of a constant pool entry. @@ -993,7 +1094,7 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(dbits, MVT::i64); SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T); return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec)); + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec)); } return SDValue(); @@ -1013,9 +1114,9 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, MachineRegisterInfo &RegInfo = MF.getRegInfo(); SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>(); - unsigned ArgOffset = SPUFrameInfo::minStackSize(); + unsigned ArgOffset = SPUFrameLowering::minStackSize(); unsigned ArgRegIdx = 0; - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -1080,7 +1181,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // or we're forced to do vararg int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, 0); ArgOffset += StackSlotSize; } @@ -1091,8 +1193,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // vararg handling: if (isVarArg) { - // FIXME: we should be able to query the argument registers from - // tablegen generated code. + // FIXME: we should be able to query the argument registers from + // tablegen generated code. static const unsigned ArgRegs[] = { SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, @@ -1117,9 +1219,9 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); - unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass, dl); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); - SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), false, false, 0); Chain = Store.getOperand(0); MemOps.push_back(Store); @@ -1163,14 +1265,14 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); unsigned NumOps = Outs.size(); - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); - + const unsigned NumArgRegs = ArgLocs.size(); @@ -1184,7 +1286,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Figure out which arguments are going to go in registers, and which in // memory. 
- unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR] + unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR] unsigned ArgRegIdx = 0; // Keep track of registers passing arguments @@ -1219,7 +1321,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); ArgOffset += StackSlotSize; } @@ -1230,7 +1333,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Accumulate how many bytes are to be pushed on the stack, including the // linkage area, and parameter passing area. According to the SPU ABI, // we minimally need space for [LR] and [SP]. - unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize(); + unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize(); // Insert a call sequence start Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes, @@ -1311,7 +1414,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1334,7 +1437,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // If the call has results, copy the values out of the ret val registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; - + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); @@ -1567,7 +1670,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: Unexpected floating point vector element."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(Value32, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, + return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); break; } @@ -1577,7 +1680,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(f64val, MVT::i64); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, + return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); break; } @@ -1587,7 +1690,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SmallVector<SDValue, 8> Ops; Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); } case MVT::v8i16: { @@ -1621,7 +1724,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, if (upper == lower) { // Magic constant that can be matched by IL, ILA, et. al. 
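LowerV2I64Splat above splits the 64-bit splat constant into its upper and lower 32-bit words; when the two words are equal the splat reduces to a v4i32 splat of that single word, bitcast back to v2i64 and matchable by IL/ILA and friends. A trivial host-side check of that decomposition; the helper name is illustrative:

// Check whether a v2i64 splat value can be built as a v4i32 splat of one word.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool splatsAsSingleWord(uint64_t splat, uint32_t &word) {
  uint32_t upper = static_cast<uint32_t>(splat >> 32);
  uint32_t lower = static_cast<uint32_t>(splat);
  word = lower;
  return upper == lower;
}

int main() {
  uint32_t w;
  assert(splatsAsSingleWord(0x1234567812345678ULL, w) && w == 0x12345678u);
  assert(!splatsAsSingleWord(0x0000000112345678ULL, w));
  std::printf("v2i64 splat sketch ok\n");
  return 0;
}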
SDValue Val = DAG.getTargetConstant(upper, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Val, Val, Val, Val)); } else { @@ -1650,7 +1753,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create lower vector if not a special pattern if (!lower_special) { SDValue LO32C = DAG.getConstant(lower, MVT::i32); - LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, LO32C, LO32C, LO32C, LO32C)); } @@ -1658,7 +1761,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create upper vector if not a special pattern if (!upper_special) { SDValue HI32C = DAG.getConstant(upper, MVT::i32); - HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, HI32C, HI32C, HI32C, HI32C)); } @@ -1735,14 +1838,14 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned CurrElt = 0; unsigned MaxElts = VecVT.getVectorNumElements(); unsigned PrevElt = 0; - unsigned V0Elt = 0; bool monotonic = true; bool rotate = true; + int rotamt=0; EVT maskVT; // which of the c?d instructions to use if (EltVT == MVT::i8) { V2EltIdx0 = 16; - maskVT = MVT::v16i8; + maskVT = MVT::v16i8; } else if (EltVT == MVT::i16) { V2EltIdx0 = 8; maskVT = MVT::v8i16; @@ -1758,7 +1861,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { for (unsigned i = 0; i != MaxElts; ++i) { if (SVN->getMaskElt(i) < 0) continue; - + unsigned SrcElt = SVN->getMaskElt(i); if (monotonic) { @@ -1782,13 +1885,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if ((PrevElt == SrcElt - 1) || (PrevElt == MaxElts - 1 && SrcElt == 0)) { PrevElt = SrcElt; - if (SrcElt == 0) - V0Elt = i; } else { rotate = false; } - } else if (i == 0) { - // First time through, need to keep track of previous element + } else if (i == 0 || (PrevElt==0 && SrcElt==1)) { + // First time or after a "wrap around" + rotamt = SrcElt-i; PrevElt = SrcElt; } else { // This isn't a rotation, takes elements from vector 2 @@ -1806,15 +1908,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), DAG.getConstant(V2EltOffset, MVT::i32)); - SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, + SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); // Use shuffle mask in SHUFB synthetic instruction: return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, ShufMaskOp); } else if (rotate) { - int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8; - + if (rotamt < 0) + rotamt +=MaxElts; + rotamt *= EltVT.getSizeInBits()/8; return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(), V1, DAG.getConstant(rotamt, MVT::i16)); } else { @@ -1999,7 +2102,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2069,7 +2172,7 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getRegister(SPU::R1, PtrVT), DAG.getConstant(Offset, PtrVT)); // 
widen the mask when dealing with half vectors - EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), 128/ VT.getVectorElementType().getSizeInBits()); SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); @@ -2077,7 +2180,7 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getNode(SPUISD::SHUFB, dl, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), VecOp, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask)); + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask)); return result; } @@ -2197,12 +2300,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { ConstVec = Op.getOperand(0); Arg = Op.getOperand(1); if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) { - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } else { ConstVec = Op.getOperand(1); Arg = Op.getOperand(0); - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } } @@ -2243,7 +2346,7 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { */ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); @@ -2419,7 +2522,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently // selected to a NOP: - SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs); + SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs); SDValue lhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2453,7 +2556,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, ISD::SETGT)); } - SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs); + SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs); SDValue rhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2567,7 +2670,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) // Type to truncate to EVT VT = Op.getValueType(); MVT simpleVT = VT.getSimpleVT(); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); @@ -2575,7 +2678,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); EVT Op0VT = Op0.getValueType(); - if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) { + if (Op0VT == MVT::i128 && simpleVT == MVT::i64) { // Create shuffle mask, least significant doubleword of quadword unsigned maskHigh = 0x08090a0b; unsigned maskLow = 0x0c0d0e0f; @@ -2616,6 +2719,12 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); MVT Op0VT = Op0.getValueType().getSimpleVT(); + // extend i8 & i16 via i32 + if (Op0VT == MVT::i8 || Op0VT == MVT::i16) { + Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0); + Op0VT = MVT::i32; + } + // The type to extend to needs to be a i128 and // the type to extend from needs to be i64 or i32. 
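LowerSIGN_EXTEND above first widens i8/i16 sources to i32, then extends i32/i64 to i128 by producing a word of sign-bit copies with an arithmetic shift and shuffling it into the upper half of the quadword while the value occupies the lower half. A host model that collapses the sra/SHUFB sequence into a single 64-bit arithmetic shift (the resulting bytes match the i64 case); the struct and helper are illustrative:

// Model of sign extension to i128, represented as a hi/lo pair of uint64_t.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct U128 { uint64_t hi, lo; };

static U128 signExtend64To128(int64_t v) {
  // Arithmetic shift replicates the sign bit across the whole word, which is
  // what the sra-by-31 plus shuffle achieves for the upper quadword half.
  uint64_t signWord = static_cast<uint64_t>(v >> 63);
  return {signWord, static_cast<uint64_t>(v)};
}

int main() {
  U128 a = signExtend64To128(-2);
  assert(a.hi == 0xffffffffffffffffULL && a.lo == 0xfffffffffffffffeULL);
  U128 b = signExtend64To128(7);
  assert(b.hi == 0 && b.lo == 7);
  std::printf("sign-extend sketch ok\n");
  return 0;
}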
assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) && @@ -2640,12 +2749,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0), DAG.getConstant(31, MVT::i32)); + // reinterpret as a i128 (SHUFB requires it). This gets lowered away. + SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op0VT, Op0, + DAG.getTargetConstant( + SPU::GPRCRegClass.getID(), + MVT::i32)), 0); // Shuffle bytes - Copy the sign bits into the upper 64 bits // and the input value into the lower 64 bits. SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt, - DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask); - - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle); + extended, sraVal, shufMask); + return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle); } //! Custom (target-specific) lowering entry point @@ -2903,8 +3017,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const } break; } - case SPUISD::SHLQUAD_L_BITS: - case SPUISD::SHLQUAD_L_BYTES: + case SPUISD::SHL_BITS: + case SPUISD::SHL_BYTES: case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); @@ -2982,6 +3096,38 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const return TargetLowering::getConstraintType(ConstraintLetter); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +SPUTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break;
+ //FIXME: Seems like the supported constraint letters were just copied + // from PPC, as the following doesn't correspond to the GCC docs. + // I'm leaving it so until someone adds the corresponding lowering support. + case 'b': + case 'r': + case 'f': + case 'd': + case 'v': + case 'y': + weight = CW_Register; + break; + } + return weight; +} + std::pair<unsigned, const TargetRegisterClass*> SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const @@ -3086,3 +3232,28 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The SPU target isn't yet aware of offsets. return false; } + +// can we compare to Imm without writing it into a register? +bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + //ceqi, cgti, etc. all take s10 operand + return isInt<10>(Imm); +} + +bool +SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM, + const Type * ) const{ + + // A-form: 18bit absolute address. + if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0) + return true; + + // D-form: reg + 14bit offset + if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs)) + return true; + + // X-form: reg+reg + if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0) + return true; + + return false; +} diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 6d3c90b..95d44af 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -41,8 +41,9 @@ namespace llvm { CNTB, ///< Count leading ones in bytes PREFSLOT2VEC, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 - SHLQUAD_L_BITS, ///< Rotate quad left, by bits - SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes + SHL_BITS, ///< Shift quad left, by bits + SHL_BYTES, ///< Shift quad left, by bytes + SRL_BYTES, ///< Shift quad right, by bytes. Insert zeros. VEC_ROTL, ///< Vector rotate left VEC_ROTR, ///< Vector rotate right ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) @@ -129,6 +130,11 @@ namespace llvm { ConstraintType getConstraintType(const std::string &ConstraintLetter) const; + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; @@ -170,6 +176,19 @@ namespace llvm { const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; + + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + virtual bool isLegalAddressingMode(const AddrMode &AM, + const Type *Ty) const; + + /// After allocating this many registers, the allocator should feel + /// register pressure. The value is a somewhat random guess, based on the + /// number of non callee saved registers in the C calling convention. 
+ virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC, + MachineFunction &MF) const{ + return 50; + } }; } diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 26d6b4f..f9e6c72 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -16,6 +16,7 @@ #include "SPUInstrBuilder.h" #include "SPUTargetMachine.h" #include "SPUGenInstrInfo.inc" +#include "SPUHazardRecognizers.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -54,6 +55,16 @@ SPUInstrInfo::SPUInstrInfo(SPUTargetMachine &tm) RI(*TM.getSubtargetImpl(), *this) { /* NOP */ } +/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for +/// this target when scheduling the DAG. +ScheduleHazardRecognizer *SPUInstrInfo::CreateTargetHazardRecognizer( + const TargetMachine *TM, + const ScheduleDAG *DAG) const { + const TargetInstrInfo *TII = TM->getInstrInfo(); + assert(TII && "No InstrInfo?"); + return new SPUHazardRecognizer(*TII); +} + unsigned SPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { @@ -129,7 +140,7 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { unsigned opc; - bool isValidFrameIdx = (FrameIdx < SPUFrameInfo::maxFrameOffset()); + bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); if (RC == SPU::GPRCRegisterClass) { opc = (isValidFrameIdx ? SPU::STQDr128 : SPU::STQXr128); } else if (RC == SPU::R64CRegisterClass) { @@ -164,7 +175,7 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { unsigned opc; - bool isValidFrameIdx = (FrameIdx < SPUFrameInfo::maxFrameOffset()); + bool isValidFrameIdx = (FrameIdx < SPUFrameLowering::maxFrameOffset()); if (RC == SPU::GPRCRegisterClass) { opc = (isValidFrameIdx ? 
SPU::LQDr128 : SPU::LQXr128); } else if (RC == SPU::R64CRegisterClass) { diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h index 191e55d..e5e9148 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.h +++ b/lib/Target/CellSPU/SPUInstrInfo.h @@ -32,6 +32,10 @@ namespace llvm { /// virtual const SPURegisterInfo &getRegisterInfo() const { return RI; } + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetMachine *TM, + const ScheduleDAG *DAG) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreToStackSlot(const MachineInstr *MI, diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index ca0fe00..25f6fd0 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -416,7 +416,7 @@ multiclass ImmLoadAddress def lo: ILARegInst<R32C, symbolLo, imm18>; def lsa: ILAInst<(outs R32C:$rT), (ins symbolLSA:$val), - [/* no pattern */]>; + [(set R32C:$rT, imm18:$val)]>; } defm ILA : ImmLoadAddress; @@ -1167,10 +1167,10 @@ class XSHWRegInst<RegisterClass rclass>: [(set rclass:$rDest, (sext R16C:$rSrc))]>; multiclass ExtendHalfwordWord { - def v4i32: XSHWVecInst<v4i32, v8i16>; - + def v4i32: XSHWVecInst<v8i16, v4i32>; + def r16: XSHWRegInst<R32C>; - + def r32: XSHWInRegInst<R32C, [(set R32C:$rDest, (sext_inreg R32C:$rSrc, i16))]>; def r64: XSHWInRegInst<R64C, [/* no pattern */]>; @@ -1385,59 +1385,6 @@ class ORRegInst<RegisterClass rclass>: ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>; -// ORCvtForm: OR conversion form -// -// This is used to "convert" the preferred slot to its vector equivalent, as -// well as convert a vector back to its preferred slot. -// -// These are effectively no-ops, but need to exist for proper type conversion -// and type coercion. 
- -class ORCvtForm<dag OOL, dag IOL, list<dag> pattern = [/* no pattern */]> - : SPUInstr<OOL, IOL, "or\t$rT, $rA, $rA", IntegerOp> { - bits<7> RA; - bits<7> RT; - - let Pattern = pattern; - - let Inst{0-10} = 0b10000010000; - let Inst{11-17} = RA; - let Inst{18-24} = RA; - let Inst{25-31} = RT; -} - -class ORPromoteScalar<RegisterClass rclass>: - ORCvtForm<(outs VECREG:$rT), (ins rclass:$rA)>; - -class ORExtractElt<RegisterClass rclass>: - ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>; - -/* class ORCvtRegGPRC<RegisterClass rclass>: - ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; */ - -/* class ORCvtGPRCReg<RegisterClass rclass>: - ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; */ - -class ORCvtFormR32Reg<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs rclass:$rT), (ins R32C:$rA), pattern>; - -class ORCvtFormRegR32<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs R32C:$rT), (ins rclass:$rA), pattern>; - -class ORCvtFormR64Reg<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs rclass:$rT), (ins R64C:$rA), pattern>; - -class ORCvtFormRegR64<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs R64C:$rT), (ins rclass:$rA), pattern>; - -class ORCvtGPRCVec: - ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>; - -class ORCvtVecGPRC: - ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>; - -class ORCvtVecVec: - ORCvtForm<(outs VECREG:$rT), (ins VECREG:$rA)>; multiclass BitwiseOr { @@ -1468,119 +1415,48 @@ multiclass BitwiseOr def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), [/* no pattern */]>; - - // scalar->vector promotion, prefslot2vec: - def v16i8_i8: ORPromoteScalar<R8C>; - def v8i16_i16: ORPromoteScalar<R16C>; - def v4i32_i32: ORPromoteScalar<R32C>; - def v2i64_i64: ORPromoteScalar<R64C>; - def v4f32_f32: ORPromoteScalar<R32FP>; - def v2f64_f64: ORPromoteScalar<R64FP>; - - // vector->scalar demotion, vec2prefslot: - def i8_v16i8: ORExtractElt<R8C>; - def i16_v8i16: ORExtractElt<R16C>; - def i32_v4i32: ORExtractElt<R32C>; - def i64_v2i64: ORExtractElt<R64C>; - def f32_v4f32: ORExtractElt<R32FP>; - def f64_v2f64: ORExtractElt<R64FP>; - - // Conversion from vector to GPRC - def i128_vec: ORCvtVecGPRC; - - // Conversion from GPRC to vector - def vec_i128: ORCvtGPRCVec; - -/* - // Conversion from register to GPRC - def i128_r64: ORCvtRegGPRC<R64C>; - def i128_f64: ORCvtRegGPRC<R64FP>; - def i128_r32: ORCvtRegGPRC<R32C>; - def i128_f32: ORCvtRegGPRC<R32FP>; - def i128_r16: ORCvtRegGPRC<R16C>; - def i128_r8: ORCvtRegGPRC<R8C>; - - // Conversion from GPRC to register - def r64_i128: ORCvtGPRCReg<R64C>; - def f64_i128: ORCvtGPRCReg<R64FP>; - def r32_i128: ORCvtGPRCReg<R32C>; - def f32_i128: ORCvtGPRCReg<R32FP>; - def r16_i128: ORCvtGPRCReg<R16C>; - def r8_i128: ORCvtGPRCReg<R8C>; -*/ -/* - // Conversion from register to R32C: - def r32_r16: ORCvtFormRegR32<R16C>; - def r32_r8: ORCvtFormRegR32<R8C>; - - // Conversion from R32C to register - def r32_r16: ORCvtFormR32Reg<R16C>; - def r32_r8: ORCvtFormR32Reg<R8C>; -*/ - - // Conversion from R64C to register: - def r32_r64: ORCvtFormR64Reg<R32C>; - // def r16_r64: ORCvtFormR64Reg<R16C>; - // def r8_r64: ORCvtFormR64Reg<R8C>; - - // Conversion to R64C from register: - def r64_r32: ORCvtFormRegR64<R32C>; - // def r64_r16: ORCvtFormRegR64<R16C>; - // def r64_r8: ORCvtFormRegR64<R8C>; - - // bitconvert patterns: - def r32_f32: ORCvtFormR32Reg<R32FP, - [(set R32FP:$rT, (bitconvert R32C:$rA))]>; - def f32_r32: ORCvtFormRegR32<R32FP, - [(set R32C:$rT, (bitconvert R32FP:$rA))]>; - - def r64_f64: 
ORCvtFormR64Reg<R64FP, - [(set R64FP:$rT, (bitconvert R64C:$rA))]>; - def f64_r64: ORCvtFormRegR64<R64FP, - [(set R64C:$rT, (bitconvert R64FP:$rA))]>; } defm OR : BitwiseOr; -// scalar->vector promotion patterns (preferred slot to vector): +//===----------------------------------------------------------------------===// +// SPU::PREFSLOT2VEC and VEC2PREFSLOT re-interpretations of registers +//===----------------------------------------------------------------------===// def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)), - (ORv16i8_i8 R8C:$rA)>; + (COPY_TO_REGCLASS R8C:$rA, VECREG)>; def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)), - (ORv8i16_i16 R16C:$rA)>; + (COPY_TO_REGCLASS R16C:$rA, VECREG)>; def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)), - (ORv4i32_i32 R32C:$rA)>; + (COPY_TO_REGCLASS R32C:$rA, VECREG)>; def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)), - (ORv2i64_i64 R64C:$rA)>; + (COPY_TO_REGCLASS R64C:$rA, VECREG)>; def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)), - (ORv4f32_f32 R32FP:$rA)>; + (COPY_TO_REGCLASS R32FP:$rA, VECREG)>; def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)), - (ORv2f64_f64 R64FP:$rA)>; - -// ORi*_v*: Used to extract vector element 0 (the preferred slot), otherwise -// known as converting the vector back to its preferred slot - -def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)), - (ORi8_v16i8 VECREG:$rA)>; + (COPY_TO_REGCLASS R64FP:$rA, VECREG)>; + +def : Pat<(i8 (SPUvec2prefslot (v16i8 VECREG:$rA))), + (COPY_TO_REGCLASS (v16i8 VECREG:$rA), R8C)>; -def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)), - (ORi16_v8i16 VECREG:$rA)>; +def : Pat<(i16 (SPUvec2prefslot (v8i16 VECREG:$rA))), + (COPY_TO_REGCLASS (v8i16 VECREG:$rA), R16C)>; -def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)), - (ORi32_v4i32 VECREG:$rA)>; +def : Pat<(i32 (SPUvec2prefslot (v4i32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4i32 VECREG:$rA), R32C)>; -def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)), - (ORi64_v2i64 VECREG:$rA)>; +def : Pat<(i64 (SPUvec2prefslot (v2i64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2i64 VECREG:$rA), R64C)>; -def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)), - (ORf32_v4f32 VECREG:$rA)>; +def : Pat<(f32 (SPUvec2prefslot (v4f32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4f32 VECREG:$rA), R32FP)>; -def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)), - (ORf64_v2f64 VECREG:$rA)>; +def : Pat<(f64 (SPUvec2prefslot (v2f64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2f64 VECREG:$rA), R64FP)>; // Load Register: This is an assembler alias for a bitwise OR of a register // against itself. 
It's here because it brings some clarity to assembly @@ -2093,7 +1969,7 @@ defm EQV: BitEquivalence; class SHUFBInst<dag OOL, dag IOL, list<dag> pattern>: RRRForm<0b1000, OOL, IOL, "shufb\t$rT, $rA, $rB, $rC", - IntegerOp, pattern>; + ShuffleOp, pattern>; class SHUFBVecInst<ValueType resultvec, ValueType maskvec>: SHUFBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), @@ -2134,7 +2010,7 @@ defm SHUFB : ShuffleBytes; class SHLHInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b11111010000, OOL, IOL, "shlh\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; class SHLHVecInst<ValueType vectype>: SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB), @@ -2156,7 +2032,7 @@ defm SHLH : ShiftLeftHalfword; class SHLHIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b11111010000, OOL, IOL, "shlhi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; class SHLHIVecInst<ValueType vectype>: SHLHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), @@ -2182,7 +2058,7 @@ def : Pat<(shl R16C:$rA, (i32 uimm7:$val)), class SHLInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b11111010000, OOL, IOL, "shl\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; multiclass ShiftLeftWord { @@ -2201,7 +2077,7 @@ defm SHL: ShiftLeftWord; class SHLIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b11111010000, OOL, IOL, "shli\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; multiclass ShiftLeftWordImm { @@ -2230,7 +2106,7 @@ defm SHLI : ShiftLeftWordImm; class SHLQBIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b11011011100, OOL, IOL, "shlqbi\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class SHLQBIVecInst<ValueType vectype>: SHLQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2259,7 +2135,7 @@ defm SHLQBI : ShiftLeftQuadByBits; // enforcement, whereas with SHLQBI, we have to "take it on faith." 
class SHLQBIIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b11011111100, OOL, IOL, "shlqbii\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; class SHLQBIIVecInst<ValueType vectype>: SHLQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val), @@ -2283,7 +2159,7 @@ defm SHLQBII : ShiftLeftQuadByBitsImm; class SHLQBYInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b11111011100, OOL, IOL, "shlqby\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class SHLQBYVecInst<ValueType vectype>: SHLQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2306,7 +2182,7 @@ defm SHLQBY: ShiftLeftQuadBytes; class SHLQBYIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b11111111100, OOL, IOL, "shlqbyi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; class SHLQBYIVecInst<ValueType vectype>: SHLQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val), @@ -2330,7 +2206,7 @@ defm SHLQBYI : ShiftLeftQuadBytesImm; class SHLQBYBIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00111001111, OOL, IOL, "shlqbybi\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class SHLQBYBIVecInst<ValueType vectype>: SHLQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2359,7 +2235,7 @@ defm SHLQBYBI : ShiftLeftQuadBytesBitCount; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class ROTHInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00111010000, OOL, IOL, "roth\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; class ROTHVecInst<ValueType vectype>: ROTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), @@ -2386,7 +2262,7 @@ def ROTHr16_r32: ROTHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB), //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class ROTHIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00111110000, OOL, IOL, "rothi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; class ROTHIVecInst<ValueType vectype>: ROTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), @@ -2413,7 +2289,7 @@ def : Pat<(SPUvec_rotl (v8i16 VECREG:$rA), (i32 uimm7:$val)), class ROTInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00011010000, OOL, IOL, "rot\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; class ROTVecInst<ValueType vectype>: ROTInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2461,7 +2337,7 @@ def : Pat<(rotl R32C:$rA, (i32 (sext R8C:$rB))), class ROTIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00011110000, OOL, IOL, "roti\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; class ROTIVecInst<ValueType vectype, Operand optype, ValueType inttype, PatLeaf pred>: ROTIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val), @@ -2491,12 +2367,15 @@ defm ROTI : RotateLeftWordImm; class ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; -class ROTQBYVecInst<ValueType vectype>: - ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>; +class ROTQBYGenInst<ValueType type, RegisterClass rc>: + ROTQBYInst<(outs rc:$rT), (ins rc:$rA, R32C:$rB), + [(set (type rc:$rT), + (SPUrotbytes_left (type rc:$rA), R32C:$rB))]>; + +class ROTQBYVecInst<ValueType type>: + ROTQBYGenInst<type, VECREG>; multiclass RotateQuadLeftByBytes { @@ -2506,6 +2385,7 @@ multiclass RotateQuadLeftByBytes def v4f32: 
ROTQBYVecInst<v4f32>; def v2i64: ROTQBYVecInst<v2i64>; def v2f64: ROTQBYVecInst<v2f64>; + def i128: ROTQBYGenInst<i128, GPRC>; } defm ROTQBY: RotateQuadLeftByBytes; @@ -2516,12 +2396,15 @@ defm ROTQBY: RotateQuadLeftByBytes; class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; + +class ROTQBYIGenInst<ValueType type, RegisterClass rclass>: + ROTQBYIInst<(outs rclass:$rT), (ins rclass:$rA, u7imm:$val), + [(set (type rclass:$rT), + (SPUrotbytes_left (type rclass:$rA), (i16 uimm7:$val)))]>; class ROTQBYIVecInst<ValueType vectype>: - ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>; + ROTQBYIGenInst<vectype, VECREG>; multiclass RotateQuadByBytesImm { @@ -2531,6 +2414,7 @@ multiclass RotateQuadByBytesImm def v4f32: ROTQBYIVecInst<v4f32>; def v2i64: ROTQBYIVecInst<v2i64>; def vfi64: ROTQBYIVecInst<v2f64>; + def i128: ROTQBYIGenInst<i128, GPRC>; } defm ROTQBYI: RotateQuadByBytesImm; @@ -2539,7 +2423,7 @@ defm ROTQBYI: RotateQuadByBytesImm; class ROTQBYBIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00110011100, OOL, IOL, "rotqbybi\t$rT, $rA, $shift", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQBYBIVecInst<ValueType vectype, RegisterClass rclass>: ROTQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, rclass:$shift), @@ -2564,7 +2448,7 @@ defm ROTQBYBI : RotateQuadByBytesByBitshift; class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00011011100, OOL, IOL, "rotqbi\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQBIVecInst<ValueType vectype>: ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2589,7 +2473,7 @@ defm ROTQBI: RotateQuadByBitCount; class ROTQBIIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00011111100, OOL, IOL, "rotqbii\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQBIIVecInst<ValueType vectype, Operand optype, ValueType inttype, PatLeaf pred>: @@ -2624,7 +2508,7 @@ defm ROTQBII : RotateQuadByBitCountImm; class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b10111010000, OOL, IOL, "rothm\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; def ROTHMv8i16: ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2666,7 +2550,7 @@ def : Pat<(srl R16C:$rA, R8C:$rB), class ROTHMIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b10111110000, OOL, IOL, "rothmi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; def ROTHMIv8i16: ROTHMIInst<(outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val), @@ -2697,7 +2581,7 @@ def: Pat<(srl R16C:$rA, (i8 uimm7:$val)), // ROTM v4i32 form: See the ROTHM v8i16 comments. class ROTMInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b10011010000, OOL, IOL, "rotm\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftVec, pattern>; def ROTMv4i32: ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2732,7 +2616,7 @@ def : Pat<(srl R32C:$rA, R8C:$rB), // ROTMI v4i32 form: See the comment for ROTHM v8i16. def ROTMIv4i32: RI7Form<0b10011110000, (outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), - "rotmi\t$rT, $rA, $val", RotateShift, + "rotmi\t$rT, $rA, $val", RotShiftVec, [(set (v4i32 VECREG:$rT), (SPUvec_srl VECREG:$rA, (i32 uimm7:$val)))]>; @@ -2745,7 +2629,7 @@ def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (i8 uimm7:$val)), // ROTMI r32 form: know how to complement the immediate value. 
def ROTMIr32: RI7Form<0b10011110000, (outs R32C:$rT), (ins R32C:$rA, rotNeg7imm:$val), - "rotmi\t$rT, $rA, $val", RotateShift, + "rotmi\t$rT, $rA, $val", RotShiftVec, [(set R32C:$rT, (srl R32C:$rA, (i32 uimm7:$val)))]>; def : Pat<(srl R32C:$rA, (i16 imm:$val)), @@ -2762,7 +2646,7 @@ def : Pat<(srl R32C:$rA, (i8 imm:$val)), class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b10111011100, OOL, IOL, "rotqmby\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQMBYVecInst<ValueType vectype>: ROTQMBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2785,9 +2669,13 @@ multiclass RotateQuadBytes defm ROTQMBY : RotateQuadBytes; +def : Pat<(SPUsrl_bytes GPRC:$rA, R32C:$rB), + (ROTQMBYr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQMBYIVecInst<ValueType vectype>: ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), @@ -2827,7 +2715,7 @@ defm ROTQMBYI : RotateQuadBytesImm; class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b10110011100, OOL, IOL, "rotqmbybi\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQMBYBIVecInst<ValueType vectype>: ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2839,6 +2727,8 @@ multiclass RotateMaskQuadByBitCount def v8i16: ROTQMBYBIVecInst<v8i16>; def v4i32: ROTQMBYBIVecInst<v4i32>; def v2i64: ROTQMBYBIVecInst<v2i64>; + def r128: ROTQMBYBIInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB), + [/*no pattern*/]>; } defm ROTQMBYBI: RotateMaskQuadByBitCount; @@ -2850,7 +2740,7 @@ defm ROTQMBYBI: RotateMaskQuadByBitCount; class ROTQMBIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b10011011100, OOL, IOL, "rotqmbi\t$rT, $rA, $rB", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQMBIVecInst<ValueType vectype>: ROTQMBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), @@ -2873,13 +2763,19 @@ multiclass RotateMaskQuadByBits defm ROTQMBI: RotateMaskQuadByBits; +def : Pat<(srl GPRC:$rA, R32C:$rB), + (ROTQMBYBIr128 (ROTQMBIr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0)), + (SFIr32 R32C:$rB, 0))>; + + //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate quad and mask by bits, immediate //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b10011111100, OOL, IOL, "rotqmbii\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftQuad, pattern>; class ROTQMBIIVecInst<ValueType vectype>: ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), @@ -2907,7 +2803,7 @@ defm ROTQMBII: RotateMaskQuadByBitsImm; def ROTMAHv8i16: RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), - "rotmah\t$rT, $rA, $rB", RotateShift, + "rotmah\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R32C:$rB), @@ -2923,7 +2819,7 @@ def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R8C:$rB), def ROTMAHr16: RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB), - "rotmah\t$rT, $rA, $rB", RotateShift, + "rotmah\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; def : Pat<(sra R16C:$rA, R32C:$rB), @@ -2939,7 +2835,7 @@ def : Pat<(sra R16C:$rA, R8C:$rB), def ROTMAHIv8i16: RRForm<0b01111110000, (outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val), - "rotmahi\t$rT, $rA, $val", RotateShift, + 
"rotmahi\t$rT, $rA, $val", RotShiftVec, [(set (v8i16 VECREG:$rT), (SPUvec_sra (v8i16 VECREG:$rA), (i32 uimm7:$val)))]>; @@ -2951,7 +2847,7 @@ def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i8 uimm7:$val)), def ROTMAHIr16: RRForm<0b01111110000, (outs R16C:$rT), (ins R16C:$rA, rothNeg7imm_i16:$val), - "rotmahi\t$rT, $rA, $val", RotateShift, + "rotmahi\t$rT, $rA, $val", RotShiftVec, [(set R16C:$rT, (sra R16C:$rA, (i16 uimm7:$val)))]>; def : Pat<(sra R16C:$rA, (i32 imm:$val)), @@ -2962,7 +2858,7 @@ def : Pat<(sra R16C:$rA, (i8 imm:$val)), def ROTMAv4i32: RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), - "rotma\t$rT, $rA, $rB", RotateShift, + "rotma\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R32C:$rB), @@ -2978,7 +2874,7 @@ def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R8C:$rB), def ROTMAr32: RRForm<0b01011010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), - "rotma\t$rT, $rA, $rB", RotateShift, + "rotma\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; def : Pat<(sra R32C:$rA, R32C:$rB), @@ -2995,7 +2891,7 @@ def : Pat<(sra R32C:$rA, R8C:$rB), class ROTMAIInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b01011110000, OOL, IOL, "rotmai\t$rT, $rA, $val", - RotateShift, pattern>; + RotShiftVec, pattern>; class ROTMAIVecInst<ValueType vectype, Operand intop, ValueType inttype>: ROTMAIInst<(outs VECREG:$rT), (ins VECREG:$rA, intop:$val), @@ -4010,7 +3906,7 @@ def FCGTf32 : "fcgt\t$rT, $rA, $rB", SPrecFP, [(set R32C:$rT, (setugt R32FP:$rA, R32FP:$rB))]>; -def : Pat<(setugt R32FP:$rA, R32FP:$rB), +def : Pat<(setogt R32FP:$rA, R32FP:$rB), (FCGTf32 R32FP:$rA, R32FP:$rB)>; def FCMGTf32 : @@ -4018,7 +3914,7 @@ def FCMGTf32 : "fcmgt\t$rT, $rA, $rB", SPrecFP, [(set R32C:$rT, (setugt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>; -def : Pat<(setugt (fabs R32FP:$rA), (fabs R32FP:$rB)), +def : Pat<(setogt (fabs R32FP:$rA), (fabs R32FP:$rB)), (FCMGTf32 R32FP:$rA, R32FP:$rB)>; //-------------------------------------------------------------------------- @@ -4320,7 +4216,7 @@ def : Pat<(fabs (v4f32 VECREG:$rA)), // in the odd pipeline) //===----------------------------------------------------------------------===// -def ENOP : SPUInstr<(outs), (ins), "enop", ExecNOP> { +def ENOP : SPUInstr<(outs), (ins), "nop", ExecNOP> { let Pattern = []; let Inst{0-10} = 0b10000000010; @@ -4379,30 +4275,43 @@ def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v8i16 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v4i32 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v2i64 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v4f32 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v2f64 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(v16i8 (bitconvert (i128 GPRC:$src))), - (v16i8 (ORvec_i128 GPRC:$src))>; + (v16i8 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v8i16 (bitconvert (i128 GPRC:$src))), - (v8i16 (ORvec_i128 GPRC:$src))>; + (v8i16 (COPY_TO_REGCLASS 
GPRC:$src, VECREG))>; def : Pat<(v4i32 (bitconvert (i128 GPRC:$src))), - (v4i32 (ORvec_i128 GPRC:$src))>; + (v4i32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v2i64 (bitconvert (i128 GPRC:$src))), - (v2i64 (ORvec_i128 GPRC:$src))>; + (v2i64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v4f32 (bitconvert (i128 GPRC:$src))), - (v4f32 (ORvec_i128 GPRC:$src))>; + (v4f32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v2f64 (bitconvert (i128 GPRC:$src))), - (v2f64 (ORvec_i128 GPRC:$src))>; + (v2f64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; + +def : Pat<(i32 (bitconvert R32FP:$rA)), + (COPY_TO_REGCLASS R32FP:$rA, R32C)>; + +def : Pat<(f32 (bitconvert R32C:$rA)), + (COPY_TO_REGCLASS R32C:$rA, R32FP)>; + +def : Pat<(i64 (bitconvert R64FP:$rA)), + (COPY_TO_REGCLASS R64FP:$rA, R64C)>; + +def : Pat<(f64 (bitconvert R64C:$rA)), + (COPY_TO_REGCLASS R64C:$rA, R64FP)>; + //===----------------------------------------------------------------------===// // Instruction patterns: @@ -4453,11 +4362,12 @@ def : Pat<(i32 (zext R8C:$rSrc)), // zext 8->64: Zero extend bytes to double words def : Pat<(i64 (zext R8C:$rSrc)), - (ORi64_v2i64 (SELBv4i32 (ROTQMBYv4i32 - (ORv4i32_i32 (ANDIi8i32 R8C:$rSrc, 0xff)), + (COPY_TO_REGCLASS (SELBv4i32 (ROTQMBYv4i32 + (COPY_TO_REGCLASS + (ANDIi8i32 R8C:$rSrc,0xff), VECREG), 0x4), (ILv4i32 0x0), - (FSMBIv4i32 0x0f0f)))>; + (FSMBIv4i32 0x0f0f)), R64C)>; // anyext 8->16: Extend 8->16 bits, irrespective of sign, preserves high bits def : Pat<(i16 (anyext R8C:$rSrc)), @@ -4465,7 +4375,7 @@ def : Pat<(i16 (anyext R8C:$rSrc)), // anyext 8->32: Extend 8->32 bits, irrespective of sign, preserves high bits def : Pat<(i32 (anyext R8C:$rSrc)), - (ORIi8i32 R8C:$rSrc, 0)>; + (COPY_TO_REGCLASS R8C:$rSrc, R32C)>; // sext 16->64: Sign extend halfword to double word def : Pat<(sext_inreg R64C:$rSrc, i16), @@ -4489,7 +4399,7 @@ def : Pat<(i32 (zext (and R16C:$rSrc, 0xfff))), // anyext 16->32: Extend 16->32 bits, irrespective of sign def : Pat<(i32 (anyext R16C:$rSrc)), - (ORIi16i32 R16C:$rSrc, 0)>; + (COPY_TO_REGCLASS R16C:$rSrc, R32C)>; //===----------------------------------------------------------------------===// // Truncates: @@ -4498,61 +4408,61 @@ def : Pat<(i32 (anyext R16C:$rSrc)), //===----------------------------------------------------------------------===// def : Pat<(i8 (trunc GPRC:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)), R8C)>; def : Pat<(i8 (trunc R64C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)), R8C)>; def : Pat<(i8 (trunc R32C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv4i32_i32 R32C:$src), - (ORv4i32_i32 R32C:$src), - (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; def : Pat<(i8 (trunc R16C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv8i16_i16 R16C:$src), - (ORv8i16_i16 R16C:$src), - (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + (COPY_TO_REGCLASS R16C:$src, VECREG), + (COPY_TO_REGCLASS R16C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; def : Pat<(i16 (trunc GPRC:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - 
(IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)), R16C)>; def : Pat<(i16 (trunc R64C:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)), R16C)>; def : Pat<(i16 (trunc R32C:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv4i32_i32 R32C:$src), - (ORv4i32_i32 R32C:$src), - (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)))>; + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)), R16C)>; def : Pat<(i32 (trunc GPRC:$src)), - (ORi32_v4i32 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)), R32C)>; def : Pat<(i32 (trunc R64C:$src)), - (ORi32_v4i32 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)), R32C)>; //===----------------------------------------------------------------------===// // Address generation: SPU, like PPC, has to split addresses into high and diff --git a/lib/Target/CellSPU/SPUMCAsmInfo.cpp b/lib/Target/CellSPU/SPUMCAsmInfo.cpp index 25ba88a..99aaeb0 100644 --- a/lib/Target/CellSPU/SPUMCAsmInfo.cpp +++ b/lib/Target/CellSPU/SPUMCAsmInfo.cpp @@ -24,9 +24,8 @@ SPULinuxMCAsmInfo::SPULinuxMCAsmInfo(const Target &T, StringRef TT) { GlobalPrefix = ""; PrivateGlobalPrefix = ".L"; - // Has leb128, .loc and .file + // Has leb128 HasLEB128 = true; - HasDotLocAndDotFile = true; SupportsDebugInformation = true; diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 647da30..a6e621f 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -19,16 +19,16 @@ def SPU_GenControl : SDTypeProfile<1, 1, []>; def SPUshufmask : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq, - [SDNPHasChain, SDNPOutFlag]>; + [SDNPHasChain, SDNPOutGlue]>; def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPUCallSeq, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; //===----------------------------------------------------------------------===// // Operand constraints: //===----------------------------------------------------------------------===// def SDT_SPUCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; def SPUcall : SDNode<"SPUISD::CALL", SDT_SPUCall, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; // Operand type constraints for vector shuffle/permute operations @@ -83,10 +83,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>; // SPUISelLowering.h): def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>; -// Shift left quadword by bits and bytes -def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>; -def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>; - // Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only): def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>; def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>; @@ -105,6 +101,12 @@ def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", def 
SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS", SPUvecshift_type>; +// Shift entire quad left by bytes/bits. Zeros are shifted in on the right +// SHL_BITS the same as SHL for i128, but ISD::SHL is not implemented for i128 +def SPUshlquad_l_bytes: SDNode<"SPUISD::SHL_BYTES", SPUvecshift_type, []>; +def SPUshlquad_l_bits: SDNode<"SPUISD::SHL_BITS", SPUvecshift_type, []>; +def SPUsrl_bytes: SDNode<"SPUISD::SRL_BYTES", SPUvecshift_type, []>; + // SPU form select mask for bytes, immediate def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>; @@ -154,4 +156,4 @@ class NoEncode<string E> { //===----------------------------------------------------------------------===// def retflag : SDNode<"SPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInFlag]>; + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/CellSPU/SPUNopFiller.cpp b/lib/Target/CellSPU/SPUNopFiller.cpp new file mode 100644 index 0000000..e2bd2d7 --- /dev/null +++ b/lib/Target/CellSPU/SPUNopFiller.cpp @@ -0,0 +1,153 @@ +//===-- SPUNopFiller.cpp - Add nops/lnops to align the pipelines---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The final pass just before assembly printing. This pass is the last +// checkpoint where nops and lnops are added to the instruction stream to +// satisfy the dual issue requirements. The actual dual issue scheduling is +// done (TODO: nowhere, currently) +// +//===----------------------------------------------------------------------===// + +#include "SPU.h" +#include "SPUTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + struct SPUNopFiller : public MachineFunctionPass { + + TargetMachine &TM; + const TargetInstrInfo *TII; + const InstrItineraryData *IID; + bool isEvenPlace; // the instruction slot (mem address) at hand is even/odd + + static char ID; + SPUNopFiller(TargetMachine &tm) + : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()), + IID(tm.getInstrItineraryData()) + { + DEBUG( dbgs() << "********** SPU Nop filler **********\n" ; ); + } + + virtual const char *getPassName() const { + return "SPU nop/lnop Filler"; + } + + void runOnMachineBasicBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &F) { + isEvenPlace = true; //all functions get an .align 3 directive at start + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) + runOnMachineBasicBlock(*FI); + return true; //never-ever do any more modifications, just print it! + } + + typedef enum { none = 0, // no more instructions in this function / BB + pseudo = 1, // this does not get executed + even = 2, + odd = 3 } SPUOpPlace; + SPUOpPlace getOpPlacement( MachineInstr &instr ); + + }; + char SPUNopFiller::ID = 0; + +} + +// Fill a BasicBlock to alignment. +// In the assebly we align the functions to 'even' adresses, but +// basic blocks have an implicit alignmnet. We hereby define +// basic blocks to have the same, even, alignment. 
+void SPUNopFiller:: +runOnMachineBasicBlock(MachineBasicBlock &MBB) +{ + assert( isEvenPlace && "basic block start from odd address"); + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + { + SPUOpPlace this_optype, next_optype; + MachineBasicBlock::iterator J = I; + J++; + + this_optype = getOpPlacement( *I ); + next_optype = none; + while (J!=MBB.end()){ + next_optype = getOpPlacement( *J ); + ++J; + if (next_optype != pseudo ) + break; + } + + // padd: odd(wrong), even(wrong), ... + // to: nop(corr), odd(corr), even(corr)... + if( isEvenPlace && this_optype == odd && next_optype == even ) { + DEBUG( dbgs() <<"Adding NOP before: "; ); + DEBUG( I->dump(); ); + BuildMI(MBB, I, I->getDebugLoc(), TII->get(SPU::ENOP)); + isEvenPlace=false; + } + + // padd: even(wrong), odd(wrong), ... + // to: lnop(corr), even(corr), odd(corr)... + else if ( !isEvenPlace && this_optype == even && next_optype == odd){ + DEBUG( dbgs() <<"Adding LNOP before: "; ); + DEBUG( I->dump(); ); + BuildMI(MBB, I, I->getDebugLoc(), TII->get(SPU::LNOP)); + isEvenPlace=true; + } + + // now go to next mem slot + if( this_optype != pseudo ) + isEvenPlace = !isEvenPlace; + + } + + // padd basicblock end + if( !isEvenPlace ){ + MachineBasicBlock::iterator J = MBB.end(); + J--; + if (getOpPlacement( *J ) == odd) { + DEBUG( dbgs() <<"Padding basic block with NOP\n"; ); + BuildMI(MBB, J, J->getDebugLoc(), TII->get(SPU::ENOP)); + } + else { + J++; + DEBUG( dbgs() <<"Padding basic block with LNOP\n"; ); + BuildMI(MBB, J, DebugLoc(), TII->get(SPU::LNOP)); + } + isEvenPlace=true; + } +} + +FunctionPass *llvm::createSPUNopFillerPass(SPUTargetMachine &tm) { + return new SPUNopFiller(tm); +} + +// Figure out if 'instr' is executed in the even or odd pipeline +SPUNopFiller::SPUOpPlace +SPUNopFiller::getOpPlacement( MachineInstr &instr ) { + int sc = instr.getDesc().getSchedClass(); + const InstrStage *stage = IID->beginStage(sc); + unsigned FUs = stage->getUnits(); + SPUOpPlace retval; + + switch( FUs ) { + case 0: retval = pseudo; break; + case 1: retval = odd; break; + case 2: retval = even; break; + default: retval= pseudo; + assert( false && "got unknown FuncUnit\n"); + break; + }; + return retval; +} diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td index e1a0358..96cde51 100644 --- a/lib/Target/CellSPU/SPUOperands.td +++ b/lib/Target/CellSPU/SPUOperands.td @@ -143,7 +143,7 @@ def immU16 : PatLeaf<(imm), [{ def imm18 : PatLeaf<(imm), [{ // imm18 predicate: True if the immediate fits into an 18-bit unsigned field. 
int Value = (int) N->getZExtValue(); - return ((Value & ((1 << 19) - 1)) == Value); + return isUInt<18>(Value); }]>; def lo16 : PatLeaf<(imm), [{ @@ -203,7 +203,7 @@ def FPimm_sext16 : SDNodeXForm<fpimm, [{ def FPimm_u18 : SDNodeXForm<fpimm, [{ float fval = N->getValueAPF().convertToFloat(); - return getI32Imm(FloatToBits(fval) & ((1 << 19) - 1)); + return getI32Imm(FloatToBits(fval) & ((1 << 18) - 1)); }]>; def fpimmSExt16 : PatLeaf<(fpimm), [{ @@ -225,7 +225,7 @@ def hi16_f32 : PatLeaf<(fpimm), [{ def fpimm18 : PatLeaf<(fpimm), [{ if (N->getValueType(0) == MVT::f32) { uint32_t Value = FloatToBits(N->getValueAPF().convertToFloat()); - return ((Value & ((1 << 19) - 1)) == Value); + return isUInt<18>(Value); } return false; @@ -654,7 +654,11 @@ def memrr : Operand<iPTR> { // A-form : abs (256K LSA offset) // D-form(2): [r+I7] (7-bit signed offset + reg) -def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", [], []>; -def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", [], []>; -def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", [], []>; -def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", [], []>; +def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", + [], [SDNPWantRoot]>; +def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", + [], [SDNPWantRoot]>; +def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", + [], [SDNPWantRoot]>; +def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", + [], [SDNPWantRoot]>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index cf71891..0bdd50a 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -18,7 +18,7 @@ #include "SPUInstrBuilder.h" #include "SPUSubtarget.h" #include "SPUMachineFunction.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" #include "llvm/Constants.h" #include "llvm/Type.h" #include "llvm/CodeGen/ValueTypes.h" @@ -30,7 +30,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ValueTypes.h" -#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -240,25 +240,6 @@ BitVector SPURegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// -// needsFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas or -// if frame pointer elimination is disabled. -// -static bool needsFP(const MachineFunction &MF) { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects(); -} - -//-------------------------------------------------------------------------- -// hasFP - Return true if the specified function actually has a dedicated frame -// pointer register. This is true if the function needs a frame pointer and has -// a non-zero stack size. 
-bool -SPURegisterInfo::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getStackSize() && needsFP(MF); -} - //-------------------------------------------------------------------------- void SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF, @@ -302,7 +283,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MachineOperand &MO = MI.getOperand(OpNo); // Offset is biased by $lr's slot at the bottom. - Offset += MO.getImm() + MFI->getStackSize() + SPUFrameInfo::minStackSize(); + Offset += MO.getImm() + MFI->getStackSize() + SPUFrameLowering::minStackSize(); assert((Offset & 0xf) == 0 && "16-byte alignment violated in eliminateFrameIndex"); @@ -329,225 +310,6 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, } } -/// determineFrameLayout - Determine the size of the frame and maximum call -/// frame size. -void -SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const -{ - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Get the number of bytes to allocate from the FrameInfo - unsigned FrameSize = MFI->getStackSize(); - - // Get the alignments provided by the target, and the maximum alignment - // (if any) of the fixed frame objects. - unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment()); - assert(isPowerOf2_32(Align) && "Alignment is not power of 2"); - unsigned AlignMask = Align - 1; - - // Get the maximum call frame size of all the calls. - unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); - - // If we have dynamic alloca then maxCallFrameSize needs to be aligned so - // that allocations will be aligned. - if (MFI->hasVarSizedObjects()) - maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; - - // Update maximum call frame size. - MFI->setMaxCallFrameSize(maxCallFrameSize); - - // Include call frame size in total. - FrameSize += maxCallFrameSize; - - // Make sure the frame is aligned. - FrameSize = (FrameSize + AlignMask) & ~AlignMask; - - // Update frame info. - MFI->setStackSize(FrameSize); -} - -void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) - const { - // Mark LR and SP unused, since the prolog spills them to stack and - // we don't want anyone else to spill them for us. - // - // Also, unless R2 is really used someday, don't spill it automatically. - MF.getRegInfo().setPhysRegUnused(SPU::R0); - MF.getRegInfo().setPhysRegUnused(SPU::R1); - MF.getRegInfo().setPhysRegUnused(SPU::R2); - - MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetRegisterClass *RC = &SPU::R32CRegClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); - - -} - -void SPURegisterInfo::emitPrologue(MachineFunction &MF) const -{ - MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - - // Prepare for debug frame info. - bool hasDebugInfo = MMI.hasDebugInfo(); - MCSymbol *FrameLabel = 0; - - // Move MBBI back to the beginning of the function. - MBBI = MBB.begin(); - - // Work out frame sizes. 
- determineFrameLayout(MF); - int FrameSize = MFI->getStackSize(); - - assert((FrameSize & 0xf) == 0 - && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); - - // the "empty" frame size is 16 - just the register scavenger spill slot - if (FrameSize > 16 || MFI->adjustsStack()) { - FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); - if (hasDebugInfo) { - // Mark effective beginning of when frame pointer becomes valid. - FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); - } - - // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) - // for the ABI - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) - .addReg(SPU::R1); - if (isInt<10>(FrameSize)) { - // Spill $sp to adjusted $sp - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) - .addReg(SPU::R1); - // Adjust $sp by required amout - BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) - .addImm(FrameSize); - } else if (isInt<16>(FrameSize)) { - // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use - // $r2 to adjust $sp: - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) - .addImm(-16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) - .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) - .addReg(SPU::R2) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) - .addReg(SPU::R1) - .addReg(SPU::R2); - BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2) - .addReg(SPU::R2) - .addImm(16); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) - .addReg(SPU::R2) - .addReg(SPU::R1); - } else { - report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); - } - - if (hasDebugInfo) { - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - - // Show update of SP. - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - - // Add callee saved registers to move list. - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); - unsigned Reg = CSI[I].getReg(); - if (Reg == SPU::R0) continue; - MachineLocation CSDst(MachineLocation::VirtualFP, Offset); - MachineLocation CSSrc(Reg); - Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc)); - } - - // Mark effective beginning of when frame pointer is ready. - MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); - - MachineLocation FPDst(SPU::R1); - MachineLocation FPSrc(MachineLocation::VirtualFP); - Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc)); - } - } else { - // This is a leaf function -- insert a branch hint iff there are - // sufficient number instructions in the basic block. Note that - // this is just a best guess based on the basic block's size. 
- if (MBB.size() >= (unsigned) SPUFrameInfo::branchHintPenalty()) { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - dl = MBBI->getDebugLoc(); - - // Insert terminator label - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)) - .addSym(MMI.getContext().CreateTempSymbol()); - } - } -} - -void -SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const -{ - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int FrameSize = MFI->getStackSize(); - int LinkSlotOffset = SPUFrameInfo::stackSlotSize(); - DebugLoc dl = MBBI->getDebugLoc(); - - assert(MBBI->getOpcode() == SPU::RET && - "Can only insert epilog into returning blocks"); - assert((FrameSize & 0xf) == 0 - && "SPURegisterInfo::emitEpilogue: FrameSize not aligned"); - - // the "empty" frame size is 16 - just the register scavenger spill slot - if (FrameSize > 16 || MFI->adjustsStack()) { - FrameSize = FrameSize + SPUFrameInfo::minStackSize(); - if (isInt<10>(FrameSize + LinkSlotOffset)) { - // Reload $lr, adjust $sp by required amount - // Note: We do this to slightly improve dual issue -- not by much, but it - // is an opportunity for dual issue. - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) - .addImm(FrameSize + LinkSlotOffset) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1) - .addReg(SPU::R1) - .addImm(FrameSize); - } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { - // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use - // $r2 to adjust $sp: - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) - .addImm(16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) - .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) - .addReg(SPU::R1) - .addReg(SPU::R2); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) - .addImm(16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). - addReg(SPU::R2) - .addImm(16); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) - .addReg(SPU::R2) - .addReg(SPU::R1); - } else { - report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); - } - } -} - unsigned SPURegisterInfo::getRARegister() const { @@ -560,26 +322,16 @@ SPURegisterInfo::getFrameRegister(const MachineFunction &MF) const return SPU::R1; } -void -SPURegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const -{ - // Initial state of the frame pointer is R1. - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(SPU::R1, 0); - Moves.push_back(MachineMove(0, Dst, Src)); -} - - int SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { // FIXME: Most probably dwarf numbers differs for Linux and Darwin return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } -int +int SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const { - switch(dFormOpcode) + switch(dFormOpcode) { case SPU::AIr32: return SPU::Ar32; case SPU::LQDr32: return SPU::LQXr32; @@ -602,10 +354,10 @@ SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const // TODO this is already copied from PPC. Could this convenience function // be moved to the RegScavenger class? 
-unsigned -SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, +unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS, - const TargetRegisterClass *RC, + const TargetRegisterClass *RC, int SPAdj) const { assert(RS && "Register scavenging must be on"); diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index aedb769..641da04 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -33,7 +33,7 @@ namespace llvm { public: SPURegisterInfo(const SPUSubtarget &subtarget, const TargetInstrInfo &tii); - + //! Translate a register's enum value to a register number /*! This method translates a register's enum value to it's regiser number, @@ -56,8 +56,6 @@ namespace llvm { //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; - //! Prediate: Target has dedicated frame pointer - bool hasFP(const MachineFunction &MF) const; //! Eliminate the call frame setup pseudo-instructions void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -65,21 +63,11 @@ namespace llvm { //! Convert frame indicies into machine operands void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - //! Determine the frame's layour - void determineFrameLayout(MachineFunction &MF) const; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; - //! Emit the function prologue - void emitPrologue(MachineFunction &MF) const; - //! Emit the function epilogue - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + //! Get return address register (LR, aka R0) unsigned getRARegister() const; //! Get the stack frame register (SP, aka R1) unsigned getFrameRegister(const MachineFunction &MF) const; - //! Perform target-specific stack frame setup. - void getInitialFrameState(std::vector<MachineMove> &Moves) const; //------------------------------------------------------------------------ // New methods added: diff --git a/lib/Target/CellSPU/SPUSchedule.td b/lib/Target/CellSPU/SPUSchedule.td index a0b581f..9cd3c23 100644 --- a/lib/Target/CellSPU/SPUSchedule.td +++ b/lib/Target/CellSPU/SPUSchedule.td @@ -32,11 +32,12 @@ def FPInt : InstrItinClass; // EVEN_UNIT (FP<->integer) def ByteOp : InstrItinClass; // EVEN_UNIT def IntegerOp : InstrItinClass; // EVEN_UNIT def IntegerMulDiv: InstrItinClass; // EVEN_UNIT -def RotateShift : InstrItinClass; // EVEN_UNIT +def RotShiftVec : InstrItinClass; // EVEN_UNIT Inter vector +def RotShiftQuad : InstrItinClass; // ODD_UNIT Entire quad def ImmLoad : InstrItinClass; // EVEN_UNIT /* Note: The itinerary for the Cell SPU is somewhat contrived... 
*/ -def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [ +def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [], [ InstrItinData<LoadStore , [InstrStage<6, [ODD_UNIT]>]>, InstrItinData<BranchHints , [InstrStage<6, [ODD_UNIT]>]>, InstrItinData<BranchResolv, [InstrStage<4, [ODD_UNIT]>]>, @@ -51,7 +52,8 @@ def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [ InstrItinData<FPInt , [InstrStage<2, [EVEN_UNIT]>]>, InstrItinData<ByteOp , [InstrStage<4, [EVEN_UNIT]>]>, InstrItinData<IntegerOp , [InstrStage<2, [EVEN_UNIT]>]>, - InstrItinData<RotateShift , [InstrStage<4, [EVEN_UNIT]>]>, + InstrItinData<RotShiftVec , [InstrStage<4, [EVEN_UNIT]>]>, + InstrItinData<RotShiftQuad, [InstrStage<4, [ODD_UNIT]>]>, InstrItinData<IntegerMulDiv,[InstrStage<7, [EVEN_UNIT]>]>, InstrItinData<ImmLoad , [InstrStage<2, [EVEN_UNIT]>]> ]>; diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp index 0f18b7f..07c8352 100644 --- a/lib/Target/CellSPU/SPUSubtarget.cpp +++ b/lib/Target/CellSPU/SPUSubtarget.cpp @@ -14,6 +14,8 @@ #include "SPUSubtarget.h" #include "SPU.h" #include "SPUGenSubtarget.inc" +#include "llvm/ADT/SmallVector.h" +#include "SPURegisterInfo.h" using namespace llvm; @@ -34,3 +36,22 @@ SPUSubtarget::SPUSubtarget(const std::string &TT, const std::string &FS) : /// producing code for the JIT. void SPUSubtarget::SetJITMode() { } + +/// Enable PostRA scheduling for optimization levels -O2 and -O3. +bool SPUSubtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtarget::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtarget::ANTIDEP_CRITICAL; + // CriticalPathsRCs seems to be the set of + // RegisterClasses that antidep breakings are performed for. + // Do it for all register classes + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(&SPU::R8CRegClass); + CriticalPathRCs.push_back(&SPU::R16CRegClass); + CriticalPathRCs.push_back(&SPU::R32CRegClass); + CriticalPathRCs.push_back(&SPU::R32FPRegClass); + CriticalPathRCs.push_back(&SPU::R64CRegClass); + CriticalPathRCs.push_back(&SPU::VECREGRegClass); + return OptLevel >= CodeGenOpt::Default; +} diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h index 88201c6..d7929302 100644 --- a/lib/Target/CellSPU/SPUSubtarget.h +++ b/lib/Target/CellSPU/SPUSubtarget.h @@ -81,9 +81,13 @@ namespace llvm { /// properties of this subtarget. 
     const char *getTargetDataString() const {
       return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128"
-             "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:128:128-v128:128:128"
+             "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:64:128-v128:128:128"
              "-s:128:128-n32:64";
     }
+
+    bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                               TargetSubtarget::AntiDepBreakMode& Mode,
+                               RegClassVector& CriticalPathRCs) const;
   };
 
 } // End llvm namespace
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 480ec3f..3ed7361 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -29,7 +29,7 @@ extern "C" void LLVMInitializeCellSPUTarget() {
 }
 
 const std::pair<unsigned, int> *
-SPUFrameInfo::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
+SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
   NumEntries = 1;
   return &LR[0];
 }
@@ -40,7 +40,7 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
     Subtarget(TT, FS),
     DataLayout(Subtarget.getTargetDataString()),
     InstrInfo(*this),
-    FrameInfo(*this),
+    FrameLowering(Subtarget),
     TLInfo(*this),
     TSInfo(*this),
     InstrItins(Subtarget.getInstrItineraryData()) {
@@ -59,3 +59,12 @@ bool SPUTargetMachine::addInstSelector(PassManagerBase &PM,
   PM.add(createSPUISelDag(*this));
   return false;
 }
+
+// passes to run just before printing the assembly
+bool SPUTargetMachine::
+addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
+{
+  //align instructions with nops/lnops for dual issue
+  PM.add(createSPUNopFillerPass(*this));
+  return true;
+}
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index 7e02701..75abd5e 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -18,14 +18,14 @@
 #include "SPUInstrInfo.h"
 #include "SPUISelLowering.h"
 #include "SPUSelectionDAGInfo.h"
-#include "SPUFrameInfo.h"
+#include "SPUFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetData.h"
 
 namespace llvm {
 class PassManager;
 class GlobalValue;
-class TargetFrameInfo;
+class TargetFrameLowering;
 
 /// SPUTargetMachine
 ///
@@ -33,7 +33,7 @@ class SPUTargetMachine : public LLVMTargetMachine {
   SPUSubtarget Subtarget;
   const TargetData DataLayout;
   SPUInstrInfo InstrInfo;
-  SPUFrameInfo FrameInfo;
+  SPUFrameLowering FrameLowering;
   SPUTargetLowering TLInfo;
   SPUSelectionDAGInfo TSInfo;
   InstrItineraryData InstrItins;
@@ -48,8 +48,8 @@ public:
   virtual const SPUInstrInfo *getInstrInfo() const {
     return &InstrInfo;
   }
-  virtual const SPUFrameInfo *getFrameInfo() const {
-    return &FrameInfo;
+  virtual const SPUFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
   }
   /*!
     \note Cell SPU does not support JIT today. It could support JIT at some
@@ -75,13 +75,14 @@ public:
     return &DataLayout;
   }
 
-  virtual const InstrItineraryData getInstrItineraryData() const {
-    return InstrItins;
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
   }
 
   // Pass Pipeline Configuration
   virtual bool addInstSelector(PassManagerBase &PM,
                                CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &, CodeGenOpt::Level);
 };
 
 } // end namespace llvm
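
Note on the new variable i128 shift patterns in SPUInstrInfo.td: the srl lowering above splits the shift amount into a sub-byte bit count handled by rotqmbi and a whole-byte count handled by rotqmbybi, with the amount negated through SFIr32 (subtract from immediate 0) because the rotate-and-mask forms expect negative counts for right shifts. The following is a minimal standalone sketch of that decomposition, not the backend code itself; it assumes a compiler that provides the unsigned __int128 extension as a stand-in for a quadword register.

#include <cassert>

// Sketch of the srl i128 decomposition: shift right by (amount mod 8) bits
// (the rotqmbi part), then by (amount / 8) whole bytes (the rotqmbybi part).
// Both steps shift zeros in from the left.
static unsigned __int128 srl_i128_decomposed(unsigned __int128 value,
                                             unsigned amount) {
  assert(amount < 128 && "shift amount must be below the quadword width");
  unsigned bitPart = amount % 8;   // sub-byte bit count
  unsigned bytePart = amount / 8;  // whole-byte count
  value >>= bitPart;
  value >>= 8 * bytePart;
  return value;
}

int main() {
  unsigned __int128 x =
      ((unsigned __int128)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
  for (unsigned n = 0; n < 128; ++n)
    assert(srl_i128_decomposed(x, n) == x >> n); // matches a plain i128 srl
  return 0;
}

The split is exact because 8*(n/8) + n%8 equals n for any amount below 128, which is what the assertion loop checks.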
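
Note on the new SPUNopFiller pass scheduled by addPreEmitPass: it keeps even-pipeline instructions at even word addresses and odd-pipeline instructions at odd ones so the SPU can dual-issue them. The parity bookkeeping can be sketched without any LLVM types as below; Slot, fillNops and the always-append-lnop end padding are simplifications for illustration, not the pass's real MachineInstr and itinerary interface.

#include <iostream>
#include <string>
#include <vector>

// Simplified classification of an instruction's issue slot. The real pass
// derives this from the scheduling itinerary: even unit, odd unit, or a
// pseudo instruction that occupies no slot at all.
enum class Slot { Pseudo, Even, Odd };

// Walk one block while tracking whether the next memory slot is even or odd,
// and insert an even-pipeline "nop" or odd-pipeline "lnop" whenever an
// odd/even instruction pair would otherwise land on the wrong parity.
std::vector<std::string> fillNops(const std::vector<Slot> &block) {
  std::vector<std::string> out;
  bool evenPlace = true; // blocks are assumed to start at an even address
  for (std::size_t i = 0; i < block.size(); ++i) {
    Slot cur = block[i];

    // Look ahead to the next instruction that actually occupies a slot.
    Slot next = Slot::Pseudo;
    for (std::size_t j = i + 1; j < block.size(); ++j)
      if (block[j] != Slot::Pseudo) { next = block[j]; break; }

    if (evenPlace && cur == Slot::Odd && next == Slot::Even) {
      out.push_back("nop");  // filler takes the even slot, pair re-aligns
      evenPlace = false;
    } else if (!evenPlace && cur == Slot::Even && next == Slot::Odd) {
      out.push_back("lnop"); // filler takes the odd slot, pair re-aligns
      evenPlace = true;
    }

    out.push_back(cur == Slot::Even ? "even-op"
                  : cur == Slot::Odd ? "odd-op" : "pseudo");
    if (cur != Slot::Pseudo)
      evenPlace = !evenPlace; // each real instruction advances one slot
  }
  if (!evenPlace)
    out.push_back("lnop"); // simplified padding back to an even boundary
  return out;
}

int main() {
  // An odd-pipeline op followed by an even-pipeline op at an even start
  // address would mis-pair; the filler prints: nop, odd-op, even-op, lnop.
  for (const std::string &s : fillNops({Slot::Odd, Slot::Even}))
    std::cout << s << "\n";
  return 0;
}

In the real pass the classification comes from the instruction itinerary, and the end-of-block padding also checks whether the final instruction itself executes in the odd pipeline before choosing between nop and lnop.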
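
Note on the SPUOperands.td predicate changes: switching from the hand-rolled mask to isUInt<18> also tightens the check, since masking with ((1 << 19) - 1) keeps 19 bits and therefore let the old imm18 and fpimm18 predicates accept values that do not fit an 18-bit field. A small standalone check of the difference, with oldImm18 and newImm18 as illustrative helpers rather than LLVM code:

#include <cassert>
#include <cstdint>

// Old predicate: masks with ((1 << 19) - 1), i.e. 19 bits, so any value up
// to 0x7ffff passes even though an 18-bit field tops out at 0x3ffff.
static bool oldImm18(uint32_t v) { return (v & ((1u << 19) - 1)) == v; }

// isUInt<18>-style predicate: true only for values representable in 18 bits.
static bool newImm18(uint32_t v) { return v < (1u << 18); }

int main() {
  assert(oldImm18(0x3ffff) && newImm18(0x3ffff));   // 2^18 - 1 fits in 18 bits
  assert(oldImm18(0x40000) && !newImm18(0x40000));  // 2^18 needs 19 bits
  assert(!oldImm18(0x80000) && !newImm18(0x80000)); // 2^19 rejected by both
  return 0;
}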