41 files changed, 6530 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
new file mode 100644
index 0000000..1830213
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
@@ -0,0 +1,249 @@
+//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "PTXInstPrinter.h"
+#include "MCTargetDesc/PTXBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#include "PTXGenAsmWriter.inc"
+
+// Sets up the MCInstPrinter base with the asm/instr/register info and then
+// records the feature bits reported by the subtarget info.
+PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                               const MCRegisterInfo &MRI,
+                               const MCSubtargetInfo &STI)
+    : MCInstPrinter(MAI, MII, MRI) {
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+/// Print the PTX name of the register encoded in \p RegNo to \p OS.
+///
+/// The number is unpacked exactly as the masks and shifts below show: bits
+/// [2:0] give the register space, bits [5:3] give the register type, and
+/// the bits above those give the numeric suffix. The output is '%', a
+/// textual prefix, then the suffix in decimal.
+///
+/// Register spaces or types without a branch below end in llvm_unreachable.
+void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const unsigned Space  = RegNo & 0x7;
+  const unsigned Type   = (RegNo >> 3) & 0x7;
+  const unsigned Suffix = RegNo >> 6;
+
+  // All register names begin with a percent sign.
+  OS << "%";
+
+  // Choose the textual prefix that precedes the numeric suffix.
+  const char *Prefix = 0;
+  if (Space == PTXRegisterSpace::Return) {
+    Prefix = "ret";
+  } else if (Space == PTXRegisterSpace::Argument) {
+    Prefix = "arg";
+  } else if (Space == PTXRegisterSpace::Reg) {
+    // Plain registers are named after their register type.
+    if (Type == PTXRegisterType::Pred)
+      Prefix = "p";
+    else if (Type == PTXRegisterType::B16)
+      Prefix = "rh";
+    else if (Type == PTXRegisterType::B32)
+      Prefix = "r";
+    else if (Type == PTXRegisterType::B64)
+      Prefix = "rd";
+    else if (Type == PTXRegisterType::F32)
+      Prefix = "f";
+    else if (Type == PTXRegisterType::F64)
+      Prefix = "fd";
+    else
+      llvm_unreachable("Unknown register type!");
+  } else {
+    llvm_unreachable("Unknown register space!");
+  }
+
+  // Prefix and suffix are written back to back.
+  OS << Prefix;
+  OS << Suffix;
+}
+
+// Prints one instruction: the guard predicate (if any), the instruction
+// text (CALL uses the hand-written printCall), a closing ';', and Annot.
+void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  printPredicate(MI, O);
+
+  if (MI->getOpcode() == PTX::CALL)
+    printCall(MI, O);
+  else
+    printInstruction(MI, O);
+  O << ";";
+  printAnnotation(O, Annot);
+}
+
+/// Print the guard predicate of \p MI, if it has one.
+///
+/// The predicate is a (register, PTXPredicate kind) operand pair: operands 0
+/// and 1 for CALL, the last two operands for everything else. PTX spells a
+/// guard "@p" and a negated guard "@!p", so the '@' is emitted in both cases.
+void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) {
+  int RegIndex;
+  int OpIndex;
+
+  if (MI->getOpcode() == PTX::CALL) {
+    RegIndex = 0;
+    OpIndex  = 1;
+  } else {
+    RegIndex = MI->getNumOperands()-2;
+    OpIndex = MI->getNumOperands()-1;
+  }
+
+  int PredOp = MI->getOperand(OpIndex).getImm();
+  if (PredOp == PTXPredicate::None)
+    return; // unpredicated instruction: print nothing
+
+  O << (PredOp == PTXPredicate::Negate ? "@!" : "@");
+  printOperand(MI, RegIndex, O);
+}
+
+/// Print a CALL as "\tcall.uni\t(rets), callee, (args)"; the "(rets), "
+/// part is left out when the instruction has no return operands.
+/// Operands are consumed in order: two predicate operands (skipped here),
+/// the return count, the return operands, the callee expression, the
+/// argument count, and the argument operands.
+void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) {
+  O << "\tcall.uni\t";
+
+  unsigned Cursor = 2; // step over the predicate slot
+  const unsigned RetCount = MI->getOperand(Cursor++).getImm();
+  if (RetCount != 0) {
+    O << "(";
+    for (unsigned R = 0; R != RetCount; ++R) {
+      if (R != 0)
+        O << ", ";
+      printOperand(MI, Cursor++, O);
+    }
+    O << "), ";
+  }
+
+  const MCExpr *Callee = MI->getOperand(Cursor++).getExpr();
+  const unsigned ArgCount = MI->getOperand(Cursor++).getImm();
+
+  // Calls to printf or puts are redirected to vprintf; every other symbol
+  // keeps its own name, and non-symbol expressions are printed as-is.
+  const MCSymbolRefExpr *CalleeRef = dyn_cast<MCSymbolRefExpr>(Callee);
+  if (!CalleeRef) {
+    O << *Callee;
+  } else {
+    StringRef Name = CalleeRef->getSymbol().getName();
+    bool IsPrintLike = Name == "printf" || Name == "puts";
+    O << (IsPrintLike ? StringRef("vprintf") : Name);
+  }
+
+  O << ", (";
+  for (unsigned A = 0; A != ArgCount; ++A) {
+    if (A != 0)
+      O << ", ";
+    printOperand(MI, Cursor++, O);
+  }
+  O << ")";
+}
+
+/// Print operand \p OpNo of \p MI: integer immediates in decimal, FP
+/// immediates as "0D" plus the 64-bit pattern of the double in hex,
+/// registers via printRegName, and expressions by symbol name or as-is.
+void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    O << Op.getImm();
+  } else if (Op.isFPImm()) {
+    double Imm = Op.getFPImm();
+    APFloat FPImm(Imm);
+    APInt FPIntImm = FPImm.bitcastToAPInt();
+    // PTX requires exactly 16 hex digits after "0D". The hex conversion
+    // drops leading zeros, so left-pad every value (not only zero);
+    // otherwise small magnitudes such as denormals come out too short.
+    std::string Hex = utohexstr(FPIntImm.getZExtValue());
+    O << "0D" << std::string(16 - Hex.size(), '0') << Hex;
+  } else if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+      O << SymRefExpr->getSymbol().getName();
+    } else {
+      O << *Expr;
+    }
+  }
+}
+
+// Prints a memory operand as "base" or "base+offset": the base is operand
+// OpNo and the immediate offset is operand OpNo+1, omitted when it is zero.
+void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Offset = MI->getOperand(OpNo+1);
+  printOperand(MI, OpNo, O);
+  if (Offset.getImm() != 0)
+    O << "+" << Offset.getImm();
+}
+
+/// Print the PTX rounding-mode suffix selected by operand \p OpNo of \p MI.
+///
+/// The operand must be an immediate holding a PTXRoundingMode value.
+/// RndNone prints nothing. RndDefault cannot be printed (its unreachable
+/// message blames the FP rounding-mode pass for not replacing it), and any
+/// value without a case is unreachable as well.
+void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert (Op.isImm() && "Rounding modes must be immediate values");
+
+  const char *Suffix = 0;
+  switch (Op.getImm()) {
+  case PTXRoundingMode::RndNone:
+    return; // no suffix for this mode
+  case PTXRoundingMode::RndNearestEven:
+    Suffix = ".rn"; break;
+  case PTXRoundingMode::RndTowardsZero:
+    Suffix = ".rz"; break;
+  case PTXRoundingMode::RndNegInf:
+    Suffix = ".rm"; break;
+  case PTXRoundingMode::RndPosInf:
+    Suffix = ".rp"; break;
+  case PTXRoundingMode::RndApprox:
+    Suffix = ".approx"; break;
+  case PTXRoundingMode::RndNearestEvenInt:
+    Suffix = ".rni"; break;
+  case PTXRoundingMode::RndTowardsZeroInt:
+    Suffix = ".rzi"; break;
+  case PTXRoundingMode::RndNegInfInt:
+    Suffix = ".rmi"; break;
+  case PTXRoundingMode::RndPosInfInt:
+    Suffix = ".rpi"; break;
+  case PTXRoundingMode::RndDefault:
+    llvm_unreachable("FP rounding-mode pass did not handle instruction!");
+  default:
+    llvm_unreachable("Unknown rounding mode!");
+  }
+
+  O << Suffix;
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
new file mode 100644
index 0000000..ea4d504
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
@@ -0,0 +1,45 @@
+//===- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXINSTPRINTER_H
+#define PTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class PTXInstPrinter : public MCInstPrinter {
+public:
+  PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+  // Printing entry points: one whole instruction, and one encoded register.
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  // Hand-written helpers implemented in PTXInstPrinter.cpp.
+  void printPredicate(const MCInst *MI, raw_ostream &O);
+  void printCall(const MCInst *MI, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
new file mode 100644
index 0000000..a3e0f32
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
@@ -0,0 +1,134 @@
+//===-- PTXBaseInfo.h - Top level definitions for PTX -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the PTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXBASEINFO_H
+#define PTXBASEINFO_H
+
+#include "PTXMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  namespace PTXStateSpace {
+    enum { // numeric identifiers, one per state space named below
+      Global    = 0, // default to global state space
+      Constant  = 1,
+      Local     = 2,
+      Parameter = 3,
+      Shared    = 4
+    };
+  } // namespace PTXStateSpace
+
+  namespace PTXPredicate {
+    enum { // kind stored in an instruction's predicate-kind operand
+      Normal = 0,
+      Negate = 1,
+      None   = 2 // PTXInstPrinter::printPredicate prints nothing for None
+    };
+  } // namespace PTXPredicate
+
+  /// Rounding-mode values carried as immediate operands of PTX instructions.
+  namespace PTXRoundingMode {
+    // The trailing comment on each value is the suffix printed for it.
+    enum {
+      // Rounding Mode Flags
+      RndMask             = 15,
+      RndDefault          =  0, // --- (not printable; must be replaced first)
+      RndNone             =  1, // <NONE>
+      RndNearestEven      =  2, // .rn
+      RndTowardsZero      =  3, // .rz
+      RndNegInf           =  4, // .rm
+      RndPosInf           =  5, // .rp
+      RndApprox           =  6, // .approx
+      RndNearestEvenInt   =  7, // .rni
+      RndTowardsZeroInt   =  8, // .rzi
+      RndNegInfInt        =  9, // .rmi
+      RndPosInfInt        = 10  // .rpi
+    };
+  } // namespace PTXRoundingMode
+
+  namespace PTXRegisterType {
+    // Register type encoded in MCOperands (bits [5:3] of the register number)
+    enum {
+      Pred  = 0, // printed with prefix "p"
+      B16,       // printed with prefix "rh"
+      B32,       // printed with prefix "r"
+      B64,       // printed with prefix "rd"
+      F32,       // printed with prefix "f"
+      F64        // printed with prefix "fd"
+    };
+  } // namespace PTXRegisterType
+
+  namespace PTXRegisterSpace {
+    // Register space encoded in MCOperands (bits [2:0] of the register number)
+    enum {
+      Reg = 0,  // prefix is chosen from the register type
+      Local,    // has no case in decodeRegisterName
+      Param,    // has no case in decodeRegisterName
+      Argument, // printed with prefix "arg"
+      Return    // printed with prefix "ret"
+    };
+  } // namespace PTXRegisterSpace
+
+  /// Write the PTX name of the register encoded in \p EncodedReg to \p OS.
+  ///
+  /// As the masks and shifts below show, bits [2:0] select the register
+  /// space, bits [5:3] select the register type, and the higher bits are
+  /// the numeric suffix. The name is '%', a textual prefix, and the suffix
+  /// in decimal. Spaces or types without a branch below are unreachable.
+  inline static void decodeRegisterName(raw_ostream &OS,
+                                        unsigned EncodedReg) {
+    // Every register name starts with a percent sign.
+    OS << "%";
+
+    const unsigned Space  = EncodedReg & 0x7;
+    const unsigned Type   = (EncodedReg >> 3) & 0x7;
+    const unsigned Suffix = EncodedReg >> 6;
+
+    // Choose the textual prefix that precedes the numeric suffix.
+    const char *Prefix = 0;
+    if (Space == PTXRegisterSpace::Return) {
+      Prefix = "ret";
+    } else if (Space == PTXRegisterSpace::Argument) {
+      Prefix = "arg";
+    } else if (Space == PTXRegisterSpace::Reg) {
+      // Plain registers are named after their register type.
+      if (Type == PTXRegisterType::Pred)
+        Prefix = "p";
+      else if (Type == PTXRegisterType::B16)
+        Prefix = "rh";
+      else if (Type == PTXRegisterType::B32)
+        Prefix = "r";
+      else if (Type == PTXRegisterType::B64)
+        Prefix = "rd";
+      else if (Type == PTXRegisterType::F32)
+        Prefix = "f";
+      else if (Type == PTXRegisterType::F64)
+        Prefix = "fd";
+      else
+        llvm_unreachable("Unknown register type!");
+    } else {
+      llvm_unreachable("Unknown register space!");
+    }
+
+    // Prefix and suffix are written back to back.
+    OS << Prefix;
+    OS << Suffix;
+  }
+} // namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp
new file mode 100644
index 0000000..cdfbc80
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.cpp
@@ -0,0 +1,37 @@
+//===-- PTXMCAsmInfo.cpp - PTX asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void PTXMCAsmInfo::anchor() { } // out-of-line body for the virtual anchor()
+
+// Configures the MCAsmInfo fields that PTX assembly output depends on.
+PTXMCAsmInfo::PTXMCAsmInfo(const Target &T, const StringRef &TT) {
+  // Only the ptx64 architecture overrides the inherited pointer size.
+  const bool Is64Bit = Triple(TT).getArch() == Triple::ptx64;
+  if (Is64Bit)
+    PointerSize = 8;
+
+  // Comment leader and the prefix used for private globals.
+  CommentString       = "//";
+  PrivateGlobalPrefix = "$L__";
+
+  // Optional assembler features that are turned off for PTX.
+  AllowPeriodsInName         = false;
+  HasSetDirective            = false;
+  HasDotTypeDotSizeDirective = false;
+  HasSingleParameterDotFile  = false;
+}
diff --git a/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h
new file mode 100644
index 0000000..32ca069
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- PTXMCAsmInfo.h - PTX asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the PTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_MCASM_INFO_H
+#define PTX_MCASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  class PTXMCAsmInfo : public MCAsmInfo { // MCAsmInfo settings for PTX
+    virtual void anchor(); // defined out of line in PTXMCAsmInfo.cpp
+  public:
+    explicit PTXMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+} // namespace llvm
+
+#endif // PTX_MCASM_INFO_H
diff --git a/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
new file mode 100644
index 0000000..08fb970
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
@@ -0,0 +1,98 @@
+//===-- PTXMCTargetDesc.cpp - PTX Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXMCTargetDesc.h"
+#include "PTXMCAsmInfo.h"
+#include "InstPrinter/PTXInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "PTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "PTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "PTXGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createPTXMCInstrInfo() {
+  MCInstrInfo *Info = new MCInstrInfo(); // populated by the call below
+  InitPTXMCInstrInfo(Info);
+  return Info;
+}
+
+static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) {
+  // PTX has no return-address register, so 0 is passed for it; TT is unused.
+  MCRegisterInfo *RegInfo = new MCRegisterInfo();
+  InitPTXMCRegisterInfo(RegInfo, 0);
+  return RegInfo;
+}
+
+static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                 StringRef FS) {
+  MCSubtargetInfo *STI = new MCSubtargetInfo(); // set up from TT/CPU/FS below
+  InitPTXMCSubtargetInfo(STI, TT, CPU, FS);
+  return STI;
+}
+
+static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *CGI = new MCCodeGenInfo(); // TT is not consulted here
+  CGI->InitMCCodeGenInfo(RM, CM, OL);
+  return CGI;
+}
+
+// Builds the PTX instruction printer; only syntax variant 0 is supported.
+static MCInstPrinter *
+createPTXMCInstPrinter(const Target &T, unsigned SyntaxVariant,
+                       const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                       const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) {
+  assert(SyntaxVariant == 0 && "We only have one syntax variant");
+  PTXInstPrinter *Printer = new PTXInstPrinter(MAI, MII, MRI, STI);
+  return Printer;
+}
+
+// Registers the PTX MC-layer factories (asm info, codegen info, instruction
+// info, register info, subtarget info and instruction printer) with both
+// the ThePTX32Target and ThePTX64Target objects.
+extern "C" void LLVMInitializePTXTargetMC() {
+  Target *const PTXTargets[] = { &ThePTX32Target, &ThePTX64Target };
+  const unsigned NumTargets = sizeof(PTXTargets) / sizeof(PTXTargets[0]);
+
+  for (unsigned I = 0; I != NumTargets; ++I) {
+    Target &T = *PTXTargets[I];
+
+    // Register the MC asm info.
+    RegisterMCAsmInfo<PTXMCAsmInfo> AsmInfoReg(T);
+
+    // Register the MC codegen info.
+    TargetRegistry::RegisterMCCodeGenInfo(T, createPTXMCCodeGenInfo);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(T, createPTXMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(T, createPTXMCRegisterInfo);
+
+    // Register the MC subtarget info and the MCInstPrinter.
+    TargetRegistry::RegisterMCSubtargetInfo(T, createPTXMCSubtargetInfo);
+    TargetRegistry::RegisterMCInstPrinter(T, createPTXMCInstPrinter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h
new file mode 100644
index 0000000..542638a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- PTXMCTargetDesc.h - PTX Target Descriptions ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXMCTARGETDESC_H
+#define PTXMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+// Target objects for the two PTX variants; defined outside this header.
+extern Target ThePTX32Target;
+extern Target ThePTX64Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "PTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "PTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "PTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/PTX/PTX.h b/contrib/llvm/lib/Target/PTX/PTX.h
new file mode 100644
index 0000000..ffb92cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTX.h
@@ -0,0 +1,43 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_H
+#define PTX_H
+
+#include "MCTargetDesc/PTXBaseInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MachineInstr;
+  class MCInst;
+  class PTXAsmPrinter;
+  class PTXTargetMachine;
+  class FunctionPass;
+  // Factories for PTX back-end passes; the definitions are not in this header.
+  FunctionPass *createPTXISelDag(PTXTargetMachine &TM,
+                                 CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM,
+                                            CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createPTXRegisterAllocator();
+  // Presumably fills OutMI from MI (inferred from the name and signature).
+  void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    PTXAsmPrinter &AP);
+
+} // namespace llvm;
+
+#endif // PTX_H
diff --git a/contrib/llvm/lib/Target/PTX/PTX.td b/contrib/llvm/lib/Target/PTX/PTX.td
new file mode 100644
index 0000000..994a68e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTX.td
@@ -0,0 +1,141 @@
+//===-- PTX.td - Describe the PTX Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the PTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+//===- Architectural Features ---------------------------------------------===//
+// "double" sets SupportsDouble to true; "no-fma" sets SupportsFMA to false.
+def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true",
+                                     "Do not demote .f64 to .f32">;
+// Inverted sense: turning "no-fma" on disables FMA support.
+def FeatureNoFMA  : SubtargetFeature<"no-fma","SupportsFMA", "false",
+                                     "Disable Fused-Multiply Add">;
+
+//===- PTX Version --------------------------------------------------------===//
+// Each record stores its own value into the shared PTXVersion field.
+def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0",
+                                    "Use PTX Language Version 2.0">;
+
+def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1",
+                                    "Use PTX Language Version 2.1">;
+
+def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2",
+                                    "Use PTX Language Version 2.2">;
+
+def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3",
+                                    "Use PTX Language Version 2.3">;
+
+//===- PTX Target ---------------------------------------------------------===//
+// Every record below writes the PTXTarget field; sm20+ imply FeatureDouble.
+def FeatureSM10 : SubtargetFeature<"sm10", "PTXTarget", "PTX_SM_1_0",
+                                   "Use Shader Model 1.0">;
+def FeatureSM11 : SubtargetFeature<"sm11", "PTXTarget", "PTX_SM_1_1",
+                                   "Use Shader Model 1.1">;
+def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2",
+                                   "Use Shader Model 1.2">;
+def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3",
+                                   "Use Shader Model 1.3">;
+def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0",
+                                   "Use Shader Model 2.0", [FeatureDouble]>;
+def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1",
+                                   "Use Shader Model 2.1", [FeatureDouble]>;
+def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2",
+                                   "Use Shader Model 2.2", [FeatureDouble]>;
+def FeatureSM23 : SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3",
+                                   "Use Shader Model 2.3", [FeatureDouble]>;
+// Compute-capability variants; only compute20 implies FeatureDouble.
+def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget",
+                                        "PTX_COMPUTE_1_0",
+                                        "Use Compute Compatibility 1.0">;
+def FeatureCOMPUTE11 : SubtargetFeature<"compute11", "PTXTarget",
+                                        "PTX_COMPUTE_1_1",
+                                        "Use Compute Compatibility 1.1">;
+def FeatureCOMPUTE12 : SubtargetFeature<"compute12", "PTXTarget",
+                                        "PTX_COMPUTE_1_2",
+                                        "Use Compute Compatibility 1.2">;
+def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget",
+                                        "PTX_COMPUTE_1_3",
+                                        "Use Compute Compatibility 1.3">;
+def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget",
+                                        "PTX_COMPUTE_2_0",
+                                        "Use Compute Compatibility 2.0",
+                                        [FeatureDouble]>;
+
+//===----------------------------------------------------------------------===//
+// PTX supported processors
+//===----------------------------------------------------------------------===//
+// Shorthand: a Processor record that always uses NoItineraries.
+class Proc<string Name, list<SubtargetFeature> Features>
+  : Processor<Name, NoItineraries, Features>;
+// The "generic" processor enables no subtarget features.
+def : Proc<"generic", []>;
+
+// Processor definitions for compute/shader models
+def : Proc<"compute_10", [FeatureCOMPUTE10]>;
+def : Proc<"compute_11", [FeatureCOMPUTE11]>;
+def : Proc<"compute_12", [FeatureCOMPUTE12]>;
+def : Proc<"compute_13", [FeatureCOMPUTE13]>;
+def : Proc<"compute_20", [FeatureCOMPUTE20]>;
+def : Proc<"sm_10",      [FeatureSM10]>;
+def : Proc<"sm_11",      [FeatureSM11]>;
+def : Proc<"sm_12",      [FeatureSM12]>;
+def : Proc<"sm_13",      [FeatureSM13]>;
+def : Proc<"sm_20",      [FeatureSM20]>;
+def : Proc<"sm_21",      [FeatureSM21]>;
+def : Proc<"sm_22",      [FeatureSM22]>;
+def : Proc<"sm_23",      [FeatureSM23]>;
+// gf100/fermi list FeatureDouble although FeatureSM20 already implies it.
+// Processor definitions for common GPU architectures
+def : Proc<"g80",        [FeatureSM10]>;
+def : Proc<"gt200",      [FeatureSM13]>;
+def : Proc<"gf100",      [FeatureSM20, FeatureDouble]>;
+def : Proc<"fermi",      [FeatureSM20, FeatureDouble]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PTXRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "PTXInstrInfo.td"
+
+def PTXInstrInfo : InstrInfo; // used as the Target's InstructionSet below
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// PTX uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class (presumably
+// PTXInstPrinter, i.e. the target name plus AsmWriterClassName - confirm).
+def PTXAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def PTX : Target { // ties the instruction set and asm writer defined above
+  let InstructionSet = PTXInstrInfo;
+  let AssemblyWriters = [PTXAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.cpp b/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.cpp
new file mode 100644
index 0000000..0b6ac7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -0,0 +1,561 @@
+//===-- PTXAsmPrinter.cpp - PTX LLVM assembly writer ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-asm-printer"
+
+#include "PTXAsmPrinter.h"
+#include "PTX.h"
+#include "PTXMachineFunctionInfo.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
+#include "PTXTargetMachine.h"
+#include "llvm/Argument.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static const char PARAM_PREFIX[] = "__param_"; // prefix of parameter names
+static const char RETURN_PREFIX[] = "__ret_";  // prefix of return-value names
+
+/// getRegisterTypeName - Map a PTXRegisterType value to the PTX type
+/// specifier (".pred", ".b32", ...) printed in register declarations.
+/// A value with no case below is treated as unreachable.
+static const char *getRegisterTypeName(unsigned RegType) {
+  switch (RegType) {
+  case PTXRegisterType::Pred: return ".pred";
+  case PTXRegisterType::B16:  return ".b16";
+  case PTXRegisterType::B32:  return ".b32";
+  case PTXRegisterType::B64:  return ".b64";
+  case PTXRegisterType::F32:  return ".f32";
+  case PTXRegisterType::F64:  return ".f64";
+  default:
+    break;
+  }
+
+  // Falling out of the switch means RegType matched none of the cases.
+  llvm_unreachable("Unknown register type");
+}
+
+static const char *getStateSpaceName(unsigned addressSpace) {
+  switch (addressSpace) { // yields the PTX keyword without a leading '.'
+  case PTXStateSpace::Shared:    return "shared";
+  case PTXStateSpace::Parameter: return "param";
+  case PTXStateSpace::Local:     return "local";
+  case PTXStateSpace::Constant:  return "const";
+  case PTXStateSpace::Global:    return "global";
+  }
+  llvm_unreachable("Unknown state space"); // no case matched
+}
+
+// Returns the PTX type suffix for the scalar type at the core of `type`;
+// array and pointer layers are stripped first.  Other types are unreachable.
+static const char *getTypeName(Type* type) {
+  // Both predicates imply a SequentialType, so use the asserting cast<>
+  // rather than dereferencing an unchecked dyn_cast<> result.
+  while (type->isArrayTy() || type->isPointerTy())
+    type = cast<SequentialType>(type)->getElementType();
+
+  switch (type->getTypeID()) {
+  default: llvm_unreachable("Unknown type");
+  case Type::FloatTyID: return ".f32";
+  case Type::DoubleTyID: return ".f64";
+  case Type::IntegerTyID:
+    switch (type->getPrimitiveSizeInBits()) {
+    default: llvm_unreachable("Unknown integer bit-width");
+    case 16: return ".u16";
+    case 32: return ".u32";
+    case 64: return ".u64";
+    }
+  }
+}
+
+bool PTXAsmPrinter::doFinalization(Module &M) {
+  // XXX Global variables were already emitted at the start of the module, so
+  // temporarily detach them to keep AsmPrinter::doFinalization() from
+  // emitting them a second time, then put them back in their original order.
+  Module::GlobalListType &Globals = M.getGlobalList();
+  const int NumGlobals = Globals.size();
+  GlobalVariable **Saved = new GlobalVariable* [NumGlobals];
+
+  // Step 1: remember every global, in list order.
+  GlobalVariable **Slot = Saved;
+  for (Module::global_iterator GI = Globals.begin(), GE = Globals.end();
+       GI != GE; ++GI)
+    *Slot++ = &*GI;
+
+  // Step 2: unlink each global from the module's list (they are re-used).
+  while (!Globals.empty())
+    Globals.remove(Globals.begin());
+
+  // Step 3: run the generic finalization against the now-empty list.
+  const bool Result = AsmPrinter::doFinalization(M);
+
+  // Step 4: re-attach the saved globals at the tail, preserving their order.
+  for (int Idx = 0; Idx != NumGlobals; ++Idx)
+    Globals.insert(Globals.end(), Saved[Idx]);
+
+  delete[] Saved;
+  return Result;
+}
+
+void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
+{
+  // Writes the module preamble: the PTX header directives, a source ID per
+  // debug-info compile unit, and then function and global declarations.
+  const PTXSubtarget& Subtarget = TM.getSubtarget<PTXSubtarget>();
+
+  // .version first, then .target.  The .target line gains map_f64_to_f32
+  // when the subtarget reports no double support.
+  const char *F64Note = Subtarget.supportsDouble() ? "" : ", map_f64_to_f32";
+  OutStreamer.EmitRawText(Twine("\t.version ") +
+                          Subtarget.getPTXVersionString());
+  OutStreamer.EmitRawText(Twine("\t.target ") + Subtarget.getTargetString() +
+                          F64Note);
+
+  // .address_size is optional, but when present it must directly follow
+  // .target; it is only written when supportsPTX23() holds.
+  if (Subtarget.supportsPTX23())
+    OutStreamer.EmitRawText(Twine("\t.address_size ") +
+                            (Subtarget.is64Bit() ? "64" : "32"));
+
+  OutStreamer.AddBlankLine();
+
+  // Define any .file directives
+  DebugInfoFinder Finder;
+  Finder.processModule(M);
+
+  for (DebugInfoFinder::iterator CU = Finder.compile_unit_begin(),
+       CUEnd = Finder.compile_unit_end(); CU != CUEnd; ++CU) {
+    DICompileUnit Unit(*CU);
+    GetOrCreateSourceID(Unit.getFilename(), Unit.getDirectory());
+  }
+
+  OutStreamer.AddBlankLine();
+
+  // Visit every function of the module, then every global variable.
+  for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI)
+    EmitFunctionDeclaration(FI);
+
+  for (Module::const_global_iterator GI = M.global_begin(),
+       GE = M.global_end(); GI != GE; ++GI)
+    EmitVariableDeclaration(GI);
+}
+
+void PTXAsmPrinter::EmitFunctionBodyStart() {
+  // Opens the function body and declares what it refers to: registers,
+  // local .param variables and a .local array per non-empty frame object.
+  // All register and .param declarations are emitted as one block of text.
+  OutStreamer.EmitRawText(Twine("{"));
+
+  const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  const PTXParamManager &Params = MFI->getParamManager();
+
+  // One row per register type, listed in emission order.  Each row pairs
+  // the type with the opening text of its .reg declaration; the register
+  // count and the closing ">;" are appended when the row is printed.
+  static const struct {
+    unsigned Type;
+    const char *DeclHead;
+  } RegClasses[] = {
+    { PTXRegisterType::Pred, "\t.reg .pred %p<" },  // pred
+    { PTXRegisterType::B16,  "\t.reg .b16 %rh<" },  // i16
+    { PTXRegisterType::B32,  "\t.reg .b32 %r<" },   // i32
+    { PTXRegisterType::B64,  "\t.reg .b64 %rd<" },  // i64
+    { PTXRegisterType::F32,  "\t.reg .f32 %f<" },   // f32
+    { PTXRegisterType::F64,  "\t.reg .f64 %fd<" }   // f64
+  };
+  const unsigned NumRegClasses = sizeof(RegClasses) / sizeof(RegClasses[0]);
+
+  // Print register definitions
+  SmallString<128> Buffer;
+  raw_svector_ostream Decls(Buffer);
+
+  for (unsigned Row = 0; Row != NumRegClasses; ++Row) {
+    const unsigned Count = MFI->countRegisters(RegClasses[Row].Type,
+                                               PTXRegisterSpace::Reg);
+
+    // A type without any registers gets no declaration at all.
+    if (Count > 0)
+      Decls << RegClasses[Row].DeclHead << Count << ">;\n";
+  }
+
+  // Local params: one .param declaration per entry, sized by the value
+  // getParamSize() reports for it.
+  for (PTXParamManager::param_iterator PI = Params.local_begin(),
+       PE = Params.local_end(); PI != PE; ++PI)
+    Decls << "\t.param .b" << Params.getParamSize(*PI) << ' '
+          << Params.getParamName(*PI) << ";\n";
+
+  // Everything collected so far goes out as a single raw-text emission.
+  OutStreamer.EmitRawText(Decls.str());
+
+  // Frame objects: each one with a positive size becomes a byte array named
+  // __local<index>, carrying the object's alignment and size.
+  const MachineFrameInfo* FrameInfo = MF->getFrameInfo();
+  const unsigned NumObjects = FrameInfo->getNumObjects();
+  DEBUG(dbgs() << "Have " << NumObjects << " frame object(s)\n");
+
+  for (unsigned i = 0; i != NumObjects; ++i) {
+    DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n");
+    if (FrameInfo->getObjectSize(i) <= 0)
+      continue;
+
+    OutStreamer.EmitRawText("\t.local .align " +
+                            Twine(FrameInfo->getObjectAlignment(i)) +
+                            " .b8 __local" + Twine(i) + "[" +
+                            Twine(FrameInfo->getObjectSize(i)) + "];");
+  }
+
+  // NOTE: the code below is commented out and therefore has no effect.
+  //unsigned Index = 1;
+  // Print parameter passing params
+  //for (PTXMachineFunctionInfo::param_iterator
+  //     i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) {
+  //  std::string def = "\t.param .b";
+  //  def += utostr(*i);
+  //  def += " __ret_";
+  //  def += utostr(Index);
+  //  Index++;
+  //  def += ";";
+  //  OutStreamer.EmitRawText(Twine(def));
+  //}
+}
+
+void PTXAsmPrinter::EmitFunctionBodyEnd() {
+  OutStreamer.EmitRawText(Twine("}")); // closes the "{" of the body start
+}
+
+void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst Lowered; // filled in by the lowering call, then streamed out
+  LowerPTXMachineInstrToMCInst(MI, Lowered, *this);
+  OutStreamer.EmitInstruction(Lowered);
+}
+
+/// EmitVariableDeclaration - Emit the PTX declaration of global \p gv:
+/// optional ".extern", state space, optional ".align", a ".b<N> name[count]"
+/// array (or ".b8 name[]"), and a hex list for ConstantArray initializers.
+void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
+  // Check to see if this is a special global used by LLVM, if so, emit it.
+  if (EmitSpecialLLVMGlobal(gv))
+    return;
+
+  MCSymbol *gvsym = Mang->getSymbol(gv);
+
+  assert(gvsym->isUndefined() && "Cannot define a symbol twice!");
+
+  SmallString<128> decl;
+  raw_svector_ostream os(decl);
+
+  // check if it is defined in some other translation unit
+  if (gv->isDeclaration())
+    os << ".extern ";
+
+  // state space: e.g., .global
+  os << '.' << getStateSpaceName(gv->getType()->getAddressSpace()) << ' ';
+
+  // alignment (optional)
+  unsigned alignment = gv->getAlignment();
+  if (alignment != 0)
+    os << ".align " << alignment << ' ';
+
+  if (PointerType* pointerTy = dyn_cast<PointerType>(gv->getType())) {
+    Type* elementTy = pointerTy->getElementType();
+
+    if (elementTy->isArrayTy()) {
+      // isArrayTy() held, so the asserting cast<> is the right conversion.
+      ArrayType* arrayTy = cast<ArrayType>(elementTy);
+      elementTy = arrayTy->getElementType();
+      unsigned numElements = arrayTy->getNumElements();
+
+      // Flatten nested arrays into a single element count.
+      while (elementTy->isArrayTy()) {
+        arrayTy = cast<ArrayType>(elementTy);
+        elementTy = arrayTy->getElementType();
+        numElements *= arrayTy->getNumElements();
+      }
+
+      // FIXME: isPrimitiveType() == false for i16?
+      assert(elementTy->isSingleValueType() &&
+             "Non-primitive types are not handled");
+
+      // Find the size of the element in bits
+      unsigned elementSize = elementTy->getPrimitiveSizeInBits();
+
+      os << ".b" << elementSize << ' ' << gvsym->getName()
+         << '[' << numElements << ']';
+    } else {
+      // The space after the type keeps it from running into the symbol name.
+      os << ".b8 " << gvsym->getName() << "[]";
+    }
+
+    // handle string constants (assume ConstantArray means string)
+    if (gv->hasInitializer()) {
+      const Constant *C = gv->getInitializer();
+      if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
+        os << " = {";
+
+        for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+          if (i > 0)
+            os << ',';
+
+          os << "0x";
+          os.write_hex(cast<ConstantInt>(CA->getOperand(i))->getZExtValue());
+        }
+
+        os << '}';
+      }
+    }
+  } else {
+    // Note: this is currently the fall-through case and most likely generates
+    //       incorrect code.
+    os << getTypeName(gv->getType()) << ' ' << gvsym->getName();
+
+    if (isa<ArrayType>(gv->getType()) || isa<PointerType>(gv->getType()))
+      os << "[]";
+  }
+
+  os << ';';
+
+  OutStreamer.EmitRawText(os.str());
+  OutStreamer.AddBlankLine();
+}
+
+void PTXAsmPrinter::EmitFunctionEntryLabel() {
+  // Two symbols can end up colliding because of asm renaming, in which case
+  // this label would already have been emitted.  Report that as an error.
+  if (!CurrentFnSym->isUndefined())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' label emitted multiple times to assembly file");
+
+  const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  const PTXParamManager &PM = MFI->getParamManager();
+  const bool Kernel = MFI->isKernel();
+  const PTXSubtarget& Subtarget = TM.getSubtarget<PTXSubtarget>();
+
+  SmallString<128> Header;
+  raw_svector_ostream os(Header);
+
+  // Kernels are introduced by .entry and carry no return list.  Device
+  // functions use .func followed by a parenthesised list of return values.
+  if (Kernel) {
+    os << ".entry";
+  } else {
+    os << ".func" << " (";
+    const char *Sep = ""; // empty before the first item, ", " afterwards
+    if (Subtarget.useParamSpaceForDeviceArgs()) {
+      for (PTXParamManager::param_iterator RI = PM.ret_begin(),
+           RE = PM.ret_end(); RI != RE; ++RI) {
+        os << Sep << ".param .b" << PM.getParamSize(*RI) << ' '
+           << PM.getParamName(*RI);
+        Sep = ", ";
+      }
+    } else {
+      for (PTXMachineFunctionInfo::reg_iterator RI = MFI->retreg_begin(),
+           RE = MFI->retreg_end(); RI != RE; ++RI) {
+        os << Sep << ".reg "
+           << getRegisterTypeName(MFI->getRegisterType(*RI)) << ' '
+           << MFI->getRegisterName(*RI);
+        Sep = ", ";
+      }
+    }
+    os << ')';
+  }
+
+  // Print function name
+  os << ' ' << CurrentFnSym->getName() << " (";
+
+  const Function *F = MF->getFunction();
+
+  // Print parameters (in registers only for non-.param device functions)
+  if (!Kernel && !Subtarget.useParamSpaceForDeviceArgs()) {
+    const char *Sep = "";
+    for (PTXMachineFunctionInfo::reg_iterator AI = MFI->argreg_begin(),
+         AE = MFI->argreg_end(); AI != AE; ++AI) {
+      os << Sep << ".reg "
+         << getRegisterTypeName(MFI->getRegisterType(*AI)) << ' '
+         << MFI->getRegisterName(*AI);
+      Sep = ", ";
+    }
+  } else {
+    /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(),
+         b = i; i != e; ++i) {
+      if (i != b)
+        os << ", ";
+
+      os << ".param .b" << PM.getParamSize(*i) << ' ' << PM.getParamName(*i);
+    }*/
+    const char *Sep = "";
+    int Counter = 1;
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      const Type *ArgType = AI->getType();
+      const bool IsPointer = ArgType->isPointerTy();
+
+      // Pointers are sized by the subtarget, everything else by its type.
+      os << Sep << ".param .b";
+      if (IsPointer)
+        os << (Subtarget.is64Bit() ? "64" : "32");
+      else
+        os << ArgType->getPrimitiveSizeInBits();
+
+      if (IsPointer && Subtarget.emitPtrAttribute()) {
+        os << " .ptr";
+        switch (cast<PointerType>(ArgType)->getAddressSpace()) {
+        default:
+          llvm_unreachable("Unknown address space in argument");
+        case PTXStateSpace::Global:
+          os << " .global";
+          break;
+        case PTXStateSpace::Shared:
+          os << " .shared";
+          break;
+        }
+      }
+
+      os << " __param_" << Counter++;
+      Sep = ", ";
+    }
+  }
+  os << ')';
+
+  OutStreamer.EmitRawText(os.str());
+}
+
+void PTXAsmPrinter::EmitFunctionDeclaration(const Function* func)
+{
+  // Only functions named printf or puts produce text, and both are declared
+  // as the external vprintf function whose last two parameters are
+  // pointer-sized.  For any other function an empty string is emitted.
+  const PTXSubtarget& Subtarget = TM.getSubtarget<PTXSubtarget>();
+  const StringRef Name = func->getName();
+
+  std::string Text;
+
+  // hard-coded emission of extern vprintf function
+  if (Name == "printf" || Name == "puts") {
+    const char *PtrBits = Subtarget.is64Bit() ? "64" : "32";
+
+    Text += ".extern .func (.param .b32 __param_1) vprintf (.param .b";
+    Text += PtrBits;
+    Text += " __param_2, .param .b";
+    Text += PtrBits;
+    Text += " __param_3)\n";
+  }
+
+  // The text may be empty here; it is emitted either way.
+  OutStreamer.EmitRawText(Twine(Text));
+}
+
+unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
+                                            StringRef DirName) {
+  // If FE did not provide a file name, then assume stdin.
+  if (FileName.empty())
+    return GetOrCreateSourceID("<stdin>", StringRef());
+
+  // MCStream expects full path name as filename: join and look up again.
+  if (!DirName.empty() && !sys::path::is_absolute(FileName)) {
+    SmallString<128> FullPathName = DirName;
+    sys::path::append(FullPathName, FileName);
+    // Here FullPathName will be copied into StringMap by GetOrCreateSourceID.
+    return GetOrCreateSourceID(StringRef(FullPathName), StringRef());
+  }
+
+  // A zero value is taken to mean the entry has not been given an ID yet.
+  StringMapEntry<unsigned> &Slot = SourceIdMap.GetOrCreateValue(FileName);
+  unsigned SrcId = Slot.getValue();
+  if (SrcId == 0) {
+    SrcId = SourceIdMap.size();
+    Slot.setValue(SrcId);
+
+    // Print out a .file directive to specify files for .loc directives.
+    OutStreamer.EmitDwarfFileDirective(SrcId, "", Slot.getKey());
+  }
+  return SrcId;
+}
+
+MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  // MO is not consulted; the result is a plain VK_None reference to Symbol.
+  return MCOperand::CreateExpr(
+      MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext));
+}
+
+MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) {
+  // Converts one MachineOperand into its MCOperand form.  Operand kinds not
+  // listed in the switch are treated as unreachable.
+  const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("Unknown operand type");
+
+  case MachineOperand::MO_Register: {
+    // Register 0 is passed through as 0; any other register is replaced by
+    // the encoding recorded in the function info.
+    const unsigned Reg = MO.getReg();
+    const unsigned Encoded = Reg > 0 ? MFI->getEncodedRegister(Reg) : 0;
+    return MCOperand::CreateReg(Encoded);
+  }
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm());
+
+  case MachineOperand::MO_MachineBasicBlock:
+    // Basic blocks are referenced through their symbol.
+    return MCOperand::CreateExpr(
+        MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), OutContext));
+
+  case MachineOperand::MO_GlobalAddress:
+    return GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+
+  case MachineOperand::MO_ExternalSymbol:
+    return GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+
+  case MachineOperand::MO_FPImmediate: {
+    // Convert to IEEE double, rounding toward zero, before extracting it.
+    APFloat Value = MO.getFPImm()->getValueAPF();
+    bool LosesInfo;
+    Value.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &LosesInfo);
+    return MCOperand::CreateFPImm(Value.convertToDouble());
+  }
+  }
+}
+
+// Registers PTXAsmPrinter for both the 32-bit and the 64-bit PTX target.
+extern "C" void LLVMInitializePTXAsmPrinter() {
+  RegisterAsmPrinter<PTXAsmPrinter> Reg32(ThePTX32Target);
+  RegisterAsmPrinter<PTXAsmPrinter> Reg64(ThePTX64Target);
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.h b/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.h
new file mode 100644
index 0000000..74c8d58
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXAsmPrinter.h
@@ -0,0 +1,57 @@
+//===-- PTXAsmPrinter.h - Print machine code to a PTX file ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PTX Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXASMPRINTER_H
+#define PTXASMPRINTER_H
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter {
+public:
+  explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {}
+
+  const char *getPassName() const { return "PTX Assembly Printer"; }
+
+  bool doFinalization(Module &M); // base finalization with globals detached
+  // Emission hooks; their bodies are in PTXAsmPrinter.cpp.
+  virtual void EmitStartOfAsmFile(Module &M);
+  virtual void EmitFunctionBodyStart();   // emits "{" and declarations
+  virtual void EmitFunctionBodyEnd();     // emits "}"
+  virtual void EmitFunctionEntryLabel();  // emits the .entry/.func header
+  virtual void EmitInstruction(const MachineInstr *MI);
+  // Returns the ID for a source file, emitting a .file directive if new.
+  unsigned GetOrCreateSourceID(StringRef FileName,
+                               StringRef DirName);
+  // Helpers that build MCOperands.
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCOperand lowerOperand(const MachineOperand &MO);
+
+private:
+  void EmitVariableDeclaration(const GlobalVariable *gv);
+  void EmitFunctionDeclaration(const Function* func);
+  // File name -> source ID, filled in by GetOrCreateSourceID().
+  StringMap<unsigned> SourceIdMap;
+}; // class PTXAsmPrinter
+} // namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXFPRoundingModePass.cpp b/contrib/llvm/lib/Target/PTX/PTXFPRoundingModePass.cpp
new file mode 100644
index 0000000..a21d172
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXFPRoundingModePass.cpp
@@ -0,0 +1,181 @@
+//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a machine function pass that sets appropriate FP rounding
+// modes for all relevant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-fp-rounding-mode"
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// NOTE: PTXFPRoundingModePass should be executed just before emission.
+
+namespace {
+  /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to
+  /// all FP instructions. Essentially, this pass just looks for all FP
+  /// instructions that have a rounding mode set to RndDefault, and sets an
+  /// appropriate rounding mode based on the target device.
+  /// The per-opcode defaults are built once, by the constructor.
+  class PTXFPRoundingModePass : public MachineFunctionPass {
+    private:
+      static char ID;
+      // (rounding-mode operand index, default rounding mode)
+      typedef std::pair<unsigned, unsigned> RndModeDesc;
+
+      PTXTargetMachine& TargetMachine;        // held by reference
+      DenseMap<unsigned, RndModeDesc> Instrs; // keyed by opcode
+
+    public:
+      PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
+        : MachineFunctionPass(ID),
+          TargetMachine(TM) {
+        initializeMap(); // OptLevel is accepted but not used
+      }
+
+      virtual bool runOnMachineFunction(MachineFunction &MF);
+
+      virtual const char *getPassName() const {
+        return "PTX FP Rounding Mode Pass";
+      }
+
+    private:
+      // initializeMap fills Instrs; processInstruction patches one instruction.
+      void initializeMap();
+      void processInstruction(MachineInstr &MI);
+  }; // class PTXFPRoundingModePass
+} // end anonymous namespace
+
+using namespace llvm;
+
+char PTXFPRoundingModePass::ID = 0;
+
+bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) {
+  // Hands every instruction of every basic block to processInstruction().
+  // NOTE(review): false is returned unconditionally even though
+  // processInstruction() can rewrite an operand -- confirm this is intended.
+  for (MachineFunction::iterator Block = MF.begin(), BlockEnd = MF.end();
+       Block != BlockEnd; ++Block) {
+    for (MachineBasicBlock::iterator Inst = Block->begin(),
+         InstEnd = Block->end(); Inst != InstEnd; ++Inst) {
+      processInstruction(*Inst);
+    }
+  }
+
+  return false;
+}
+
+void PTXFPRoundingModePass::initializeMap() {
+  using namespace PTXRoundingMode;
+  const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>();
+  // Build a map of default rounding mode for all instructions that need a
+  // rounding mode.  Every descriptor is (operand index 1, default mode).
+  const RndModeDesc NearestEven(1U, (unsigned)RndNearestEven);
+  Instrs[PTX::FADDrr32] = NearestEven;
+  Instrs[PTX::FADDri32] = NearestEven;
+  Instrs[PTX::FADDrr64] = NearestEven;
+  Instrs[PTX::FADDri64] = NearestEven;
+  Instrs[PTX::FSUBrr32] = NearestEven;
+  Instrs[PTX::FSUBri32] = NearestEven;
+  Instrs[PTX::FSUBrr64] = NearestEven;
+  Instrs[PTX::FSUBri64] = NearestEven;
+  Instrs[PTX::FMULrr32] = NearestEven;
+  Instrs[PTX::FMULri32] = NearestEven;
+  Instrs[PTX::FMULrr64] = NearestEven;
+  Instrs[PTX::FMULri64] = NearestEven;
+  const RndModeDesc NoRounding(1U, (unsigned)RndNone);
+  Instrs[PTX::FNEGrr32] = NoRounding;
+  Instrs[PTX::FNEGri32] = NoRounding;
+  Instrs[PTX::FNEGrr64] = NoRounding;
+  Instrs[PTX::FNEGri64] = NoRounding;
+
+  unsigned FDivMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone;
+  Instrs[PTX::FDIVrr32] = RndModeDesc(1U, FDivMode);
+  Instrs[PTX::FDIVri32] = RndModeDesc(1U, FDivMode);
+  Instrs[PTX::FDIVrr64] = RndModeDesc(1U, FDivMode);
+  Instrs[PTX::FDIVri64] = RndModeDesc(1U, FDivMode);
+
+  unsigned FMADMode = ST.fmadNeedsRoundingMode() ? RndNearestEven : RndNone;
+  Instrs[PTX::FMADrrr32] = RndModeDesc(1U, FMADMode);
+  Instrs[PTX::FMADrri32] = RndModeDesc(1U, FMADMode);
+  Instrs[PTX::FMADrii32] = RndModeDesc(1U, FMADMode);
+  Instrs[PTX::FMADrrr64] = RndModeDesc(1U, FMADMode);
+  Instrs[PTX::FMADrri64] = RndModeDesc(1U, FMADMode);
+  Instrs[PTX::FMADrii64] = RndModeDesc(1U, FMADMode);
+
+  Instrs[PTX::FSQRTrr32] = NearestEven;
+  Instrs[PTX::FSQRTri32] = NearestEven;
+  Instrs[PTX::FSQRTrr64] = NearestEven;
+  Instrs[PTX::FSQRTri64] = NearestEven;
+  const RndModeDesc Approx(1U, (unsigned)RndApprox);
+  Instrs[PTX::FSINrr32] = Approx;
+  Instrs[PTX::FSINri32] = Approx;
+  Instrs[PTX::FSINrr64] = Approx;
+  Instrs[PTX::FSINri64] = Approx;
+  Instrs[PTX::FCOSrr32] = Approx;
+  Instrs[PTX::FCOSri32] = Approx;
+  Instrs[PTX::FCOSrr64] = Approx;
+  Instrs[PTX::FCOSri64] = Approx;
+  const RndModeDesc TowardsZeroInt(1U, (unsigned)RndTowardsZeroInt);
+  Instrs[PTX::CVTu16f32] = TowardsZeroInt;
+  Instrs[PTX::CVTs16f32] = TowardsZeroInt;
+  Instrs[PTX::CVTu16f64] = TowardsZeroInt;
+  Instrs[PTX::CVTs16f64] = TowardsZeroInt;
+  Instrs[PTX::CVTu32f32] = TowardsZeroInt;
+  Instrs[PTX::CVTs32f32] = TowardsZeroInt;
+  Instrs[PTX::CVTu32f64] = TowardsZeroInt;
+  Instrs[PTX::CVTs32f64] = TowardsZeroInt;
+  Instrs[PTX::CVTu64f32] = TowardsZeroInt;
+  Instrs[PTX::CVTs64f32] = TowardsZeroInt;
+  Instrs[PTX::CVTu64f64] = TowardsZeroInt;
+  Instrs[PTX::CVTs64f64] = TowardsZeroInt;
+
+  Instrs[PTX::CVTf32u16] = NearestEven;
+  Instrs[PTX::CVTf32s16] = NearestEven;
+  Instrs[PTX::CVTf32u32] = NearestEven;
+  Instrs[PTX::CVTf32s32] = NearestEven;
+  Instrs[PTX::CVTf32u64] = NearestEven;
+  Instrs[PTX::CVTf32s64] = NearestEven;
+  Instrs[PTX::CVTf32f64] = NearestEven;
+  Instrs[PTX::CVTf64u16] = NearestEven;
+  Instrs[PTX::CVTf64s16] = NearestEven;
+  Instrs[PTX::CVTf64u32] = NearestEven;
+  Instrs[PTX::CVTf64s32] = NearestEven;
+  Instrs[PTX::CVTf64u64] = NearestEven;
+  Instrs[PTX::CVTf64s64] = NearestEven;
+}
+
+void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) {
+  // One table lookup; opcodes without an entry need no rounding mode.
+  DenseMap<unsigned, RndModeDesc>::iterator Entry =
+    Instrs.find(MI.getOpcode());
+  if (Entry == Instrs.end())
+    return;
+  // Entry->second is (operand index, default mode).  Only an operand still
+  // set to RndDefault is rewritten.
+  MachineOperand &Op = MI.getOperand(Entry->second.first);
+  if (Op.getImm() == PTXRoundingMode::RndDefault)
+    Op.setImm(Entry->second.second);
+}
+
+FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM,
+                                                CodeGenOpt::Level OptLevel) {
+  return new PTXFPRoundingModePass(TM, OptLevel); // heap-allocated pass
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXFrameLowering.cpp b/contrib/llvm/lib/Target/PTX/PTXFrameLowering.cpp
new file mode 100644
index 0000000..e6e268e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXFrameLowering.cpp
@@ -0,0 +1,24 @@
+//===-- PTXFrameLowering.cpp - PTX Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXFrameLowering.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+using namespace llvm;
+
+void PTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+} // empty body: no prologue code is inserted
+
+void PTXFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+} // empty body: no epilogue code is inserted
diff --git a/contrib/llvm/lib/Target/PTX/PTXFrameLowering.h b/contrib/llvm/lib/Target/PTX/PTXFrameLowering.h
new file mode 100644
index 0000000..831e818
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXFrameLowering.h
@@ -0,0 +1,44 @@
+//===-- PTXFrameLowering.h - Define frame lowering for PTX -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PTX implementation of the TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_FRAMEINFO_H
+#define PTX_FRAMEINFO_H
+
+#include "PTX.h"
+#include "PTXSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class PTXSubtarget;
+
+class PTXFrameLowering : public TargetFrameLowering {
+protected:
+  const PTXSubtarget &STI; // stored by reference, not copied
+
+public:
+  explicit PTXFrameLowering(const PTXSubtarget &sti)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2),
+      STI(sti) {
+  } // NOTE(review): 2 / -2 presumably stack alignment / local-area offset
+
+  /// emitPrologue/emitEpilogue - These methods insert prolog and epilog code
+  /// into the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  /// hasFP - Always returns false.
+  bool hasFP(const MachineFunction &MF) const { return false; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PTX/PTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PTX/PTXISelDAGToDAG.cpp
new file mode 100644
index 0000000..5c7ee29
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXISelDAGToDAG.cpp
@@ -0,0 +1,356 @@
+//===-- PTXISelDAGToDAG.cpp - A dag to dag inst selector for PTX ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the PTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXMachineFunctionInfo.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// PTXDAGToDAGISel - The PTX instruction selector: picks PTX machine
+/// instructions for the operations found in a SelectionDAG.
+class PTXDAGToDAGISel : public SelectionDAGISel {
+public:
+  PTXDAGToDAGISel(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel);
+
+  /// getPassName - Human-readable name of this pass.
+  virtual const char *getPassName() const {
+    return "PTX DAG->DAG Pattern Instruction Selection";
+  }
+
+  /// Select - Main selection hook; handles a few opcodes by hand and hands
+  /// everything else to the generated SelectCode().
+  SDNode *Select(SDNode *Node);
+
+  // Complex pattern selectors for the memory operand forms.
+  bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset);
+
+  // Pull in the selector code generated from the target description.
+#include "PTXGenDAGISel.inc"
+
+private:
+  // Hand-written selection for BRCOND.  It exists only because the BRAdp
+  // instruction pattern (PTXbrcond bb:$d, ...) cannot be matched in
+  // PTXInstrInfo.td.
+  SDNode *SelectBRCOND(SDNode *Node);
+
+  // Hand-written selection for the remaining specially handled opcodes.
+  SDNode *SelectREADPARAM(SDNode *Node);
+  SDNode *SelectWRITEPARAM(SDNode *Node);
+  SDNode *SelectFrameIndex(SDNode *Node);
+
+  // Helpers for recognising and converting constant operands.
+  bool isImm(const SDValue &operand);
+  bool SelectImm(const SDValue &operand, SDValue &imm);
+
+  const PTXSubtarget& getSubtarget() const;
+}; // class PTXDAGToDAGISel
+} // namespace
+
+/// createPTXISelDag - Factory for the PTX instruction selection pass, which
+/// converts a legalized DAG into a PTX-specific DAG that is ready for
+/// instruction scheduling.  The pass is heap-allocated and returned to the
+/// caller.
+FunctionPass *llvm::createPTXISelDag(PTXTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  FunctionPass *Selector = new PTXDAGToDAGISel(TM, OptLevel);
+  return Selector;
+}
+
+/// Constructor - Forwards the target machine and optimization level to the
+/// SelectionDAGISel base class; there is no further state to set up.
+PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM,
+                                 CodeGenOpt::Level OptLevel)
+  : SelectionDAGISel(TM, OptLevel)
+{
+}
+
+/// Select - Dispatch \p Node to one of the hand-written selectors when its
+/// opcode is BRCOND, READ_PARAM, WRITE_PARAM or FrameIndex; every other
+/// opcode is handled by the generated SelectCode().
+SDNode *PTXDAGToDAGISel::Select(SDNode *Node) {
+  switch (Node->getOpcode()) {
+  case ISD::BRCOND:         return SelectBRCOND(Node);
+  case PTXISD::READ_PARAM:  return SelectREADPARAM(Node);
+  case PTXISD::WRITE_PARAM: return SelectWRITEPARAM(Node);
+  case ISD::FrameIndex:     return SelectFrameIndex(Node);
+  default:                  break;
+  }
+
+  // Not one of the special cases: use the table-generated matcher.
+  return SelectCode(Node);
+}
+
+/// SelectBRCOND - Turn a BRCOND node (chain, i1 condition, basic-block
+/// target) into a BRAdp machine node.  The condition is passed as the
+/// predicate register together with a PTXPredicate::Normal constant.
+SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
+  assert(Node->getNumOperands() >= 3);
+
+  SDValue InChain   = Node->getOperand(0);
+  SDValue Condition = Node->getOperand(1);
+  SDValue Dest      = Node->getOperand(2); // branch target
+  SDValue PredKind  = CurDAG->getTargetConstant(PTXPredicate::Normal,
+                                                MVT::i32);
+  DebugLoc DL = Node->getDebugLoc();
+
+  assert(Dest.getOpcode()         == ISD::BasicBlock);
+  assert(Condition.getValueType() == MVT::i1);
+
+  // Emit BRAdp: target, predicate register, predicate kind, chain.
+  SDValue BranchOps[] = { Dest, Condition, PredKind, InChain };
+  return CurDAG->getMachineNode(PTX::BRAdp, DL, MVT::Other, BranchOps, 4);
+}
+
+/// SelectREADPARAM - Select a READ_PARAM node (chain, index) to the
+/// READPARAM* machine instruction that matches the node's result type.
+/// The instruction is emitted unpredicated (no predicate register,
+/// PTXPredicate::None).
+SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) {
+  SDValue InChain  = Node->getOperand(0);
+  SDValue ParamIdx = Node->getOperand(1);
+
+  // The result type of the node decides which opcode is used.
+  EVT ResultVT = Node->getValueType(0);
+  assert(ResultVT.isSimple() && "READ_PARAM only implemented for MVT types");
+
+  MVT SimpleVT = ResultVT.getSimpleVT();
+
+  int Opc;
+  switch (SimpleVT.SimpleTy) {
+  case MVT::i1:  Opc = PTX::READPARAMPRED; break;
+  case MVT::i16: Opc = PTX::READPARAMI16;  break;
+  case MVT::i32: Opc = PTX::READPARAMI32;  break;
+  case MVT::i64: Opc = PTX::READPARAMI64;  break;
+  case MVT::f32: Opc = PTX::READPARAMF32;  break;
+  default:
+    // Everything that is left has to be f64.
+    assert(SimpleVT == MVT::f64 && "Unexpected type!");
+    Opc = PTX::READPARAMF64;
+    break;
+  }
+
+  SDValue NoPredReg  = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+  SDValue NoPredKind = CurDAG->getTargetConstant(PTXPredicate::None,
+                                                 MVT::i32);
+  DebugLoc DL = Node->getDebugLoc();
+
+  SDValue ReadOps[] = { ParamIdx, NoPredReg, NoPredKind, InChain };
+  return CurDAG->getMachineNode(Opc, DL, ResultVT, ReadOps, 4);
+}
+
+/// SelectWRITEPARAM - Select a WRITE_PARAM node (chain, value) to the
+/// WRITEPARAM* machine instruction that matches the type of the written
+/// value.  The instruction is emitted unpredicated (no predicate register,
+/// PTXPredicate::None) and produces only a chain.
+SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) {
+  SDValue InChain = Node->getOperand(0);
+  SDValue Written = Node->getOperand(1);
+
+  // The type of the value being written decides which opcode is used.
+  EVT ValueVT = Written->getValueType(0);
+  assert(ValueVT.isSimple() && "WRITE_PARAM only implemented for MVT types");
+
+  MVT SimpleVT = ValueVT.getSimpleVT();
+
+  int Opc;
+  switch (SimpleVT.SimpleTy) {
+  case MVT::i1:  Opc = PTX::WRITEPARAMPRED; break;
+  case MVT::i16: Opc = PTX::WRITEPARAMI16;  break;
+  case MVT::i32: Opc = PTX::WRITEPARAMI32;  break;
+  case MVT::i64: Opc = PTX::WRITEPARAMI64;  break;
+  case MVT::f32: Opc = PTX::WRITEPARAMF32;  break;
+  case MVT::f64: Opc = PTX::WRITEPARAMF64;  break;
+  default:
+    llvm_unreachable("Invalid type in SelectWRITEPARAM");
+  }
+
+  SDValue NoPredReg  = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+  SDValue NoPredKind = CurDAG->getTargetConstant(PTXPredicate::None,
+                                                 MVT::i32);
+  DebugLoc DL = Node->getDebugLoc();
+
+  SDValue WriteOps[] = { Written, NoPredReg, NoPredKind, InChain };
+  return CurDAG->getMachineNode(Opc, DL, MVT::Other, WriteOps, 4);
+}
+
+/// SelectFrameIndex - Replace a FrameIndex node by a target external symbol
+/// named after the frame symbol that PTXMachineFunctionInfo reports for
+/// that index.  The symbol keeps the value type of the original node.
+SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) {
+  const int Index = cast<FrameIndexSDNode>(Node)->getIndex();
+  PTXMachineFunctionInfo *FuncInfo = MF->getInfo<PTXMachineFunctionInfo>();
+
+  return CurDAG->getTargetExternalSymbol(FuncInfo->getFrameSymbol(Index),
+                                         Node->getValueType(0)).getNode();
+}
+
+// Match memory operand of the form [reg+reg]: an ADD with neither operand
+// an immediate.  On a match the whole ADD node is returned in R1 and R2 is
+// set to a zero constant of the address type.
+bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
+  if (Addr.getOpcode() != ISD::ADD)
+    return false;
+  if (Addr.getNumOperands() < 2)
+    return false;
+
+  // Forms involving an immediate belong to the other selectors.
+  if (isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1)))
+    return false;
+
+  assert(Addr.getValueType().isSimple() && "Type must be simple");
+
+  const MVT AddrVT = Addr.getValueType().getSimpleVT();
+  R1 = Addr;
+  R2 = CurDAG->getTargetConstant(0, AddrVT);
+  return true;
+}
+
+// Match memory operand of the form [reg] and [reg+imm].  Addresses that
+// are, or are based on, a frame index are rejected here (SelectADDRlocal
+// matches those), and a bare immediate is rejected as well (SelectADDRii
+// matches that).
+//
+// On success Base/Offset describe the address.  The offset is an i32 target
+// constant in the [reg+imm] case and a zero constant of the address type in
+// the [reg] case.  Note that Base may already have been overwritten when
+// the match fails on a frame-index base.
+bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
+                                   SDValue &Offset) {
+  // FrameIndex addresses are handled separately
+  if (isa<FrameIndexSDNode>(Addr))
+    return false;
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    Base = Addr.getOperand(0);
+    if (isa<FrameIndexSDNode>(Base))
+      return false;
+
+    // isBaseWithConstantOffset() only succeeds when operand 1 is a
+    // constant, so use the asserting cast<> instead of an unchecked
+    // dyn_cast<> whose result would be dereferenced blindly.
+    ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  // let SelectADDRii handle the [imm] case
+  if (isImm(Addr))
+    return false;
+
+  // it is [reg]
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+  return true;
+}
+
+// Match memory operand of the form [imm+imm] and [imm].  For [imm] the
+// offset is a zero constant of the address type.
+bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base,
+                                   SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::ADD) {
+    // [imm+imm]: both operands of the ADD have to be immediates.
+    if (!SelectImm(Addr.getOperand(0), Base))
+      return false;
+    return SelectImm(Addr.getOperand(1), Offset);
+  }
+
+  // [imm]: the address itself has to be an immediate.
+  if (!SelectImm(Addr, Base))
+    return false;
+
+  assert(Addr.getValueType().isSimple() && "Type must be simple");
+
+  const MVT AddrVT = Addr.getValueType().getSimpleVT();
+  Offset = CurDAG->getTargetConstant(0, AddrVT);
+  return true;
+}
+
+// Match memory operand of the form [frameindex] and [frameindex+imm].
+// This is the counterpart of SelectADDRri, which rejects exactly these
+// frame-index based addresses.
+//
+// On success Base is the frame index node.  The offset is a zero constant
+// of the address type for a bare frame index and an i32 target constant
+// otherwise.  Note that Base may already have been overwritten when the
+// match fails on a non-frame-index base.
+bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base,
+                                      SDValue &Offset) {
+  if (isa<FrameIndexSDNode>(Addr)) {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+    return true;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    Base = Addr.getOperand(0);
+    if (!isa<FrameIndexSDNode>(Base))
+      return false;
+
+    // isBaseWithConstantOffset() only succeeds when operand 1 is a
+    // constant, so use the asserting cast<> instead of an unchecked
+    // dyn_cast<> whose result would be dereferenced blindly.
+    ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+/// isImm - True when \p operand is a ConstantSDNode.
+bool PTXDAGToDAGISel::isImm(const SDValue &operand) {
+  const bool IsConstant = isa<ConstantSDNode>(operand);
+  return IsConstant;
+}
+
+/// SelectImm - If \p operand is a ConstantSDNode, store an equivalent
+/// target constant (same value, same value type) in \p imm and return
+/// true.  Otherwise return false and leave \p imm untouched.
+bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) {
+  ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(operand);
+  if (!Constant)
+    return false;
+
+  imm = CurDAG->getTargetConstant(*Constant->getConstantIntValue(),
+                                  operand.getValueType());
+  return true;
+}
+
+/// getSubtarget - The PTX subtarget of the target machine this selector
+/// works for.
+const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const {
+  const PTXSubtarget &Subtarget = TM.getSubtarget<PTXSubtarget>();
+  return Subtarget;
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXISelLowering.cpp b/contrib/llvm/lib/Target/PTX/PTXISelLowering.cpp
new file mode 100644
index 0000000..ef4455b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXISelLowering.cpp
@@ -0,0 +1,522 @@
+//===-- PTXISelLowering.cpp - PTX DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXISelLowering.h"
+#include "PTX.h"
+#include "PTXMachineFunctionInfo.h"
+#include "PTXRegisterInfo.h"
+#include "PTXSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// Constructor - Describe the PTX target to the legalizer: which value
+/// types live in which register class, which operations have to be
+/// expanded, which are legal and which are custom lowered.
+PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+  // One register class per natively supported value type.
+  addRegisterClass(MVT::i1,  PTX::RegPredRegisterClass);
+  addRegisterClass(MVT::i16, PTX::RegI16RegisterClass);
+  addRegisterClass(MVT::i32, PTX::RegI32RegisterClass);
+  addRegisterClass(MVT::i64, PTX::RegI64RegisterClass);
+  addRegisterClass(MVT::f32, PTX::RegF32RegisterClass);
+  addRegisterClass(MVT::f64, PTX::RegF64RegisterClass);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+  setMinFunctionAlignment(2);
+
+  // Let LLVM use loads/stores for all mem* operations.
+  const unsigned MemOpStoreLimit = 4096;
+  maxStoresPerMemcpy  = MemOpStoreLimit;
+  maxStoresPerMemmove = MemOpStoreLimit;
+  maxStoresPerMemset  = MemOpStoreLimit;
+
+  //
+  // Expanded operations
+  //
+
+  // (any/zero/sign) extload => load + (any/zero/sign) extend
+  const ISD::LoadExtType I16ExtLoads[] = {
+    ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD
+  };
+  for (unsigned i = 0; i != sizeof(I16ExtLoads) / sizeof(I16ExtLoads[0]); ++i)
+    setLoadExtAction(I16ExtLoads[i], MVT::i16, Expand);
+
+  // f32 extload => load + fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  // f64 truncstore => trunc + store
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // sign_extend_inreg => sign_extend
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // br_cc => brcond
+  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+  // select_cc => setcc
+  const MVT::SimpleValueType SelectCCTypes[] = {
+    MVT::Other, MVT::f32, MVT::f64
+  };
+  for (unsigned i = 0; i != sizeof(SelectCCTypes) / sizeof(SelectCCTypes[0]);
+       ++i)
+    setOperationAction(ISD::SELECT_CC, SelectCCTypes[i], Expand);
+
+  //
+  // Legal operations
+  //
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+  // i1 setcc is currently left legal; custom lowering to bitwise logic
+  // (see LowerSETCC) is disabled.
+  setOperationAction(ISD::SETCC, MVT::i1, Legal);
+
+  //
+  // Custom lowered operations
+  //
+
+  // customize translation of memory addresses
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+}
+
+/// getSetCCResultType - The result of a setcc is always i1, independent of
+/// the type \p VT.
+EVT PTXTargetLowering::getSetCCResultType(EVT VT) const {
+  return EVT(MVT::i1);
+}
+
+/// LowerOperation - Entry point for custom lowering.  Only SETCC and
+/// GlobalAddress are implemented; any other opcode is a fatal
+/// "Unimplemented operand" error.
+SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::SETCC:         return LowerSETCC(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+  default:
+    llvm_unreachable("Unimplemented operand");
+  }
+}
+
+/// getTargetNodeName - Printable name of a PTXISD opcode.  An opcode that
+/// is not listed here is a fatal "Unknown opcode" error.
+const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case PTXISD::COPY_ADDRESS: return "PTXISD::COPY_ADDRESS";
+  case PTXISD::LOAD_PARAM:   return "PTXISD::LOAD_PARAM";
+  case PTXISD::STORE_PARAM:  return "PTXISD::STORE_PARAM";
+  case PTXISD::READ_PARAM:   return "PTXISD::READ_PARAM";
+  case PTXISD::WRITE_PARAM:  return "PTXISD::WRITE_PARAM";
+  case PTXISD::EXIT:         return "PTXISD::EXIT";
+  case PTXISD::RET:          return "PTXISD::RET";
+  case PTXISD::CALL:         return "PTXISD::CALL";
+  default:
+    llvm_unreachable("Unknown opcode");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Custom Lower Operation
+//===----------------------------------------------------------------------===//
+
+/// LowerSETCC - Custom lowering hook for i1 setcc.  The simplifications
+/// once sketched here (rewriting X == 0, X == 1, X != 0 and X != 1 as
+/// bitwise logic) are disabled, so reaching this function is reported as a
+/// fatal error.  The trailing return only rebuilds the same SETCC node.
+SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer");
+
+  SDValue LHS      = Op.getOperand(0);
+  SDValue RHS      = Op.getOperand(1);
+  SDValue CondCode = Op.getOperand(2);
+  DebugLoc DL      = Op.getDebugLoc();
+
+  llvm_unreachable("setcc was not matched by a pattern!");
+
+  return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CondCode);
+}
+
+/// LowerGlobalAddress - Lower a GlobalAddress node to a COPY_ADDRESS node
+/// that wraps the matching target global address.  Both nodes use the
+/// pointer type of the target.
+SDValue PTXTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const EVT PointerVT = getPointerTy();
+  DebugLoc DL = Op.getDebugLoc();
+  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  assert(PointerVT.isSimple() && "Pointer must be to primitive type.");
+
+  SDValue TargetAddr = DAG.getTargetGlobalAddress(Global, DL, PointerVT);
+  return DAG.getNode(PTXISD::COPY_ADDRESS, DL, PointerVT.getSimpleVT(),
+                     TargetAddr);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerFormalArguments - Create the DAG nodes that deliver the incoming
+/// arguments of the current function and append them to \p InVals, one per
+/// entry of \p Ins.  The function is also recorded as kernel or device
+/// function according to \p CallConv.
+///
+/// Kernels, and device functions on subtargets that use the param space
+/// for device arguments, get one LOAD_PARAM per argument.  All other
+/// device functions get one READ_PARAM per argument plus a virtual
+/// register registered in the Argument register space.
+///
+/// Varargs and calling conventions other than PTX_Kernel/PTX_Device are
+/// fatal errors.  The incoming chain is returned unchanged.
+SDValue PTXTargetLowering::
+  LowerFormalArguments(SDValue Chain,
+                       CallingConv::ID CallConv,
+                       bool isVarArg,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       DebugLoc dl,
+                       SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const {
+  if (isVarArg)
+    llvm_unreachable("PTX does not support varargs");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const PTXSubtarget &Subtarget =
+    getTargetMachine().getSubtarget<PTXSubtarget>();
+  PTXMachineFunctionInfo *FuncInfo = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &Params = FuncInfo->getParamManager();
+
+  // Remember what kind of function this is.
+  switch (CallConv) {
+  case CallingConv::PTX_Kernel:
+    FuncInfo->setKernel(true);
+    break;
+  case CallingConv::PTX_Device:
+    FuncInfo->setKernel(false);
+    break;
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  }
+
+  const unsigned NumArgs = Ins.size();
+
+  // We do one of two things here:
+  // IsKernel || SM >= 2.0  ->  Use param space for arguments
+  // SM < 2.0               ->  Use registers for arguments
+  if (FuncInfo->isKernel() || Subtarget.useParamSpaceForDeviceArgs()) {
+    // Param space: a LOAD_PARAM from a freshly named parameter per argument.
+    for (unsigned ArgNo = 0; ArgNo != NumArgs; ++ArgNo) {
+      assert((!FuncInfo->isKernel() || Ins[ArgNo].VT != MVT::i1) &&
+             "Kernels cannot take pred operands");
+
+      unsigned Bits = Ins[ArgNo].VT.getStoreSizeInBits();
+      unsigned ParamIdx = Params.addArgumentParam(Bits);
+      const std::string &ParamName = Params.getParamName(ParamIdx);
+      SDValue ParamSym = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                     MVT::Other);
+      InVals.push_back(DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[ArgNo].VT,
+                                   Chain, ParamSym));
+    }
+
+    return Chain;
+  }
+
+  // Registers: a READ_PARAM per argument.
+  for (unsigned ArgNo = 0; ArgNo != NumArgs; ++ArgNo) {
+    EVT ArgVT = Ins[ArgNo].VT;
+    const TargetRegisterClass *RegClass = getRegClassFor(ArgVT);
+
+    // Map the value type to the PTX register type.
+    unsigned RegKind;
+    if (ArgVT == MVT::i1)
+      RegKind = PTXRegisterType::Pred;
+    else if (ArgVT == MVT::i16)
+      RegKind = PTXRegisterType::B16;
+    else if (ArgVT == MVT::i32)
+      RegKind = PTXRegisterType::B32;
+    else if (ArgVT == MVT::i64)
+      RegKind = PTXRegisterType::B64;
+    else if (ArgVT == MVT::f32)
+      RegKind = PTXRegisterType::F32;
+    else if (ArgVT == MVT::f64)
+      RegKind = PTXRegisterType::F64;
+    else
+      llvm_unreachable("Unknown parameter type");
+
+    // Use a unique index in the instruction to prevent instruction folding.
+    // Yes, this is a hack.
+    SDValue ArgIndex = DAG.getTargetConstant(ArgNo, MVT::i32);
+    unsigned VReg = MF.getRegInfo().createVirtualRegister(RegClass);
+    InVals.push_back(DAG.getNode(PTXISD::READ_PARAM, dl, ArgVT, Chain,
+                                 ArgIndex));
+
+    FuncInfo->addRegister(VReg, RegKind, PTXRegisterSpace::Argument);
+  }
+
+  return Chain;
+}
+
+/// LowerReturn - Lower a function return.
+///
+/// Kernels must return void and end in an EXIT node.  Device functions may
+/// return at most one value and end in a RET node.  On subtargets that use
+/// the param space for device arguments the value is written with
+/// STORE_PARAM to a fresh return parameter.  Otherwise it is copied into a
+/// new virtual register (registered in the Return register space) and
+/// written with WRITE_PARAM.
+///
+/// Varargs and calling conventions other than PTX_Kernel/PTX_Device are
+/// fatal errors.  Returns the EXIT or RET node.
+SDValue PTXTargetLowering::
+  LowerReturn(SDValue Chain,
+              CallingConv::ID CallConv,
+              bool isVarArg,
+              const SmallVectorImpl<ISD::OutputArg> &Outs,
+              const SmallVectorImpl<SDValue> &OutVals,
+              DebugLoc dl,
+              SelectionDAG &DAG) const {
+  if (isVarArg) llvm_unreachable("PTX does not support varargs");
+
+  switch (CallConv) {
+    default:
+      llvm_unreachable("Unsupported calling convention.");
+    case CallingConv::PTX_Kernel:
+      assert(Outs.size() == 0 && "Kernel must return void.");
+      return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain);
+    case CallingConv::PTX_Device:
+      assert(Outs.size() <= 1 && "Can at most return one value.");
+      break;
+  }
+
+  MachineFunction& MF = DAG.getMachineFunction();
+  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &PM = MFI->getParamManager();
+
+  const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
+
+  if (ST.useParamSpaceForDeviceArgs()) {
+    // Param space: store the (single) return value to a return parameter.
+    if (Outs.size() == 1) {
+      unsigned ParamSize = OutVals[0].getValueType().getSizeInBits();
+      unsigned Param = PM.addReturnParam(ParamSize);
+      const std::string &ParamName = PM.getParamName(Param);
+      SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                       MVT::Other);
+      Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                          ParamValue, OutVals[0]);
+    }
+  } else {
+    // Registers: copy each return value into a fresh virtual register and
+    // write that register with WRITE_PARAM.
+    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+      EVT                  RegVT = Outs[i].VT;
+      const TargetRegisterClass* TRC;
+      unsigned             RegType;
+
+      // Determine which register class we need
+      if (RegVT == MVT::i1) {
+        TRC = PTX::RegPredRegisterClass;
+        RegType = PTXRegisterType::Pred;
+      }
+      else if (RegVT == MVT::i16) {
+        TRC = PTX::RegI16RegisterClass;
+        RegType = PTXRegisterType::B16;
+      }
+      else if (RegVT == MVT::i32) {
+        TRC = PTX::RegI32RegisterClass;
+        RegType = PTXRegisterType::B32;
+      }
+      else if (RegVT == MVT::i64) {
+        TRC = PTX::RegI64RegisterClass;
+        RegType = PTXRegisterType::B64;
+      }
+      else if (RegVT == MVT::f32) {
+        TRC = PTX::RegF32RegisterClass;
+        RegType = PTXRegisterType::F32;
+      }
+      else if (RegVT == MVT::f64) {
+        TRC = PTX::RegF64RegisterClass;
+        RegType = PTXRegisterType::F64;
+      }
+      else {
+        llvm_unreachable("Unknown parameter type");
+      }
+
+      unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
+
+      SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]);
+      SDValue OutReg = DAG.getRegister(Reg, RegVT);
+
+      Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg);
+
+      MFI->addRegister(Reg, RegType, PTXRegisterSpace::Return);
+    }
+  }
+
+  // No glue value is ever produced above, so RET only takes the chain.
+  return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
+}
+
+/// LowerCall - Lower a call to a PTXISD::CALL node.
+///
+/// The callee has to be a global function.  It must use the PTX_Device
+/// calling convention unless it is named "printf" or "puts".  Every
+/// outgoing argument is stored with STORE_PARAM into a fresh local .param
+/// variable.  Every returned value gets its own local .param variable that
+/// is read back with LOAD_PARAM after the call and appended to \p InVals.
+///
+/// printf/puts calls always pass exactly two parameters: the first
+/// argument (the format string), and the address of a stack object that
+/// holds the remaining arguments.  Each remaining argument occupies one
+/// slot as wide as the widest of them.
+///
+/// Tail calls are never emitted, so \p isTailCall is cleared.  Returns the
+/// chain of the CALL node.
+SDValue
+PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool doesNotRet, bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  // This lowering always produces an ordinary call.  Tell the caller so;
+  // otherwise it would treat the result as a tail call and ignore InVals.
+  isTailCall = false;
+
+  MachineFunction& MF = DAG.getMachineFunction();
+  PTXMachineFunctionInfo *PTXMFI = MF.getInfo<PTXMachineFunctionInfo>();
+  PTXParamManager &PM = PTXMFI->getParamManager();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() &&
+         "Calls are not handled for the target device");
+
+  // Identify the callee function
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+  const Function *function = cast<Function>(GV);
+
+  // allow non-device calls only for printf
+  bool isPrintf = function->getName() == "printf" ||
+                  function->getName() == "puts";
+
+  assert((isPrintf || function->getCallingConv() == CallingConv::PTX_Device) &&
+         "PTX function calls must be to PTX device functions");
+
+  unsigned outSize = isPrintf ? 2 : Outs.size();
+
+  std::vector<SDValue> Ops;
+  // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs]
+  // Ops[0] (the chain) is filled in last, once all stores are chained.
+  Ops.resize(outSize + Ins.size() + 4);
+
+  // Callee
+  Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+  Ops[Ins.size()+2] = Callee;
+
+  // #Outs
+  Ops[Ins.size()+3] = DAG.getTargetConstant(outSize, MVT::i32);
+
+  if (isPrintf) {
+    assert(!OutVals.empty() && "printf/puts call without arguments");
+
+    // first argument is the address of the global string variable in memory
+    unsigned Param0 = PM.addLocalParam(getPointerTy().getSizeInBits());
+    SDValue ParamValue0 =
+      DAG.getTargetExternalSymbol(PM.getParamName(Param0).c_str(),
+                                  MVT::Other);
+    Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                        ParamValue0, OutVals[0]);
+    Ops[Ins.size()+4] = ParamValue0;
+
+    // alignment (in bits) is the maximum size of all the arguments
+    unsigned alignment = 0;
+    for (unsigned i = 1; i < OutVals.size(); ++i) {
+      alignment = std::max(alignment,
+                           OutVals[i].getValueType().getSizeInBits());
+    }
+
+    // size (in bits) is the alignment multiplied by the number of arguments
+    unsigned size = alignment * (OutVals.size() - 1);
+
+    // second argument is the address of the stack object (unless no arguments)
+    unsigned Param1 = PM.addLocalParam(getPointerTy().getSizeInBits());
+    SDValue ParamValue1 =
+      DAG.getTargetExternalSymbol(PM.getParamName(Param1).c_str(),
+                                  MVT::Other);
+    Ops[Ins.size()+5] = ParamValue1;
+
+    if (size > 0) {
+      // create a local stack object to store the arguments
+      unsigned StackObject = MFI->CreateStackObject(size / 8, alignment / 8,
+                                                    false);
+      SDValue FrameIndex = DAG.getFrameIndex(StackObject, getPointerTy());
+
+      // Every argument occupies one slot of alignment/8 bytes.  The stride
+      // must match the slot size used for the stack object above, or the
+      // stores run past the end of the object whenever all arguments are
+      // narrower than 64 bits.
+      const unsigned slotBytes = alignment / 8;
+
+      // store each of the arguments to the stack in turn
+      for (unsigned i = 1; i != OutVals.size(); ++i) {
+        SDValue SlotOffset = DAG.getTargetConstant((i - 1) * slotBytes,
+                                                   getPointerTy());
+        SDValue FrameAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                        FrameIndex, SlotOffset);
+        Chain = DAG.getStore(Chain, dl, OutVals[i], FrameAddr,
+                             MachinePointerInfo(),
+                             false, false, 0);
+      }
+
+      // copy the address of the local frame index to get the address in
+      // non-local space
+      SDValue genericAddr = DAG.getNode(PTXISD::COPY_ADDRESS, dl,
+                                        getPointerTy(), FrameIndex);
+
+      // store this address in the second argument
+      Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                          ParamValue1, genericAddr);
+    }
+  } else {
+    // Generate STORE_PARAM nodes for each function argument.  In PTX, function
+    // arguments are explicitly stored into .param variables and passed as
+    // arguments. There is no register/stack-based calling convention in PTX.
+    for (unsigned i = 0; i != OutVals.size(); ++i) {
+      unsigned Size = OutVals[i].getValueType().getSizeInBits();
+      unsigned Param = PM.addLocalParam(Size);
+      const std::string &ParamName = PM.getParamName(Param);
+      SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                       MVT::Other);
+      Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+                          ParamValue, OutVals[i]);
+      Ops[i+Ins.size()+4] = ParamValue;
+    }
+  }
+
+  std::vector<SDValue> InParams;
+
+  // Generate list of .param variables to hold the return value(s).
+  Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32);
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    unsigned Size = Ins[i].VT.getStoreSizeInBits();
+    unsigned Param = PM.addLocalParam(Size);
+    const std::string &ParamName = PM.getParamName(Param);
+    SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+                                                     MVT::Other);
+    Ops[i+2] = ParamValue;
+    InParams.push_back(ParamValue);
+  }
+
+  // The call depends on all argument stores created above.
+  Ops[0] = Chain;
+
+  // Create the CALL node.
+  Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size());
+
+  // Create the LOAD_PARAM nodes that retrieve the function return value(s).
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
+                               InParams[i]);
+    InVals.push_back(Load);
+  }
+
+  return Chain;
+}
+
+/// getNumRegisters - Every argument counts as exactly one "register",
+/// whatever its type \p VT is.
+unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) {
+  const unsigned RegistersPerValue = 1;
+  return RegistersPerValue;
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXISelLowering.h b/contrib/llvm/lib/Target/PTX/PTXISelLowering.h
new file mode 100644
index 0000000..33220f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXISelLowering.h
@@ -0,0 +1,82 @@
+//===-- PTXISelLowering.h - PTX DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_ISEL_LOWERING_H
+#define PTX_ISEL_LOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace PTXISD {
+  enum NodeType {
+    FIRST_NUMBER = ISD::BUILTIN_OP_END, // PTX node numbering starts at ISD::BUILTIN_OP_END
+    LOAD_PARAM,   // used by the call-lowering code to read results back from .param variables
+    STORE_PARAM,  // used by the call-lowering code to store arguments into .param variables
+    READ_PARAM,
+    WRITE_PARAM,
+    EXIT,         // referenced as "PTXISD::EXIT" by the PTXexit SDNode
+    RET,          // referenced as "PTXISD::RET" by the PTXret SDNode
+    COPY_ADDRESS, // referenced as "PTXISD::COPY_ADDRESS" by the PTXcopyaddress SDNode
+    CALL          // referenced as "PTXISD::CALL" by the PTXcall SDNode
+  };
+}                               // namespace PTXISD
+
+class PTXTargetLowering : public TargetLowering { // PTX DAG lowering, derived from TargetLowering
+  public:
+    explicit PTXTargetLowering(TargetMachine &TM);
+
+    virtual const char *getTargetNodeName(unsigned Opcode) const; // printable name for a node opcode
+
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; // entry point for lowering a single operation
+
+    virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; // lowering for a SETCC operation
+
+    virtual SDValue // lowers incoming arguments (Ins); results are appended to InVals
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl,
+                           SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue // lowers a return of the values in OutVals (described by Outs)
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv,
+                  bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl,
+                  SelectionDAG &DAG) const;
+
+    virtual SDValue // lowers a call: OutVals are the arguments, InVals receives the results
+      LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+                bool isVarArg, bool doesNotRet, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<SDValue> &OutVals,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual EVT getSetCCResultType(EVT VT) const; // result type to use for SETCC given VT
+
+    virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT); // NOTE(review): the only non-const hook here -- confirm it matches the TargetLowering signature it is meant to replace
+
+  private:
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; // private helper for global-address operations
+}; // class PTXTargetLowering
+} // namespace llvm
+
+#endif // PTX_ISEL_LOWERING_H
diff --git a/contrib/llvm/lib/Target/PTX/PTXInstrFormats.td b/contrib/llvm/lib/Target/PTX/PTXInstrFormats.td
new file mode 100644
index 0000000..267e834
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXInstrFormats.td
@@ -0,0 +1,51 @@
+//===-- PTXInstrFormats.td - PTX Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Rounding Mode Specifier
+/*class RoundingMode<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def RndDefault     : RoundingMode<0>;
+def RndNearestEven : RoundingMode<1>;
+def RndNearestZero : RoundingMode<2>;
+def RndNegInf      : RoundingMode<3>;
+def RndPosInf      : RoundingMode<4>;
+def RndApprox      : RoundingMode<5>;*/
+
+
+// Rounding Mode Operand
+def RndMode : Operand<i32> { // i32 operand that carries the rounding mode
+  let PrintMethod = "printRoundingMode"; // printed by the printRoundingMode method
+}
+
+def RndDefault : PatLeaf<(i32 0)>; // pattern leaf for the i32 constant 0
+
+// PTX predicate operand: a (RegPred, i32imm) pair defaulting to (zero_reg, 2).
+// Leave PrintMethod empty; predicate printing is defined elsewhere.
+def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm),
+                                     (ops (i1 zero_reg), (i32 2))>; // NOTE(review): 2 is presumably PTXPredicate::None -- confirm
+
+def RndModeOperand : Operand<OtherVT> { // OtherVT operand with one sub-operand
+  let MIOperandInfo = (ops i32imm);     // the sub-operand is a single i32 immediate
+}
+
+// Instruction Types
+let Namespace = "PTX" in { // records defined inside use the PTX namespace
+
+  class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern>
+    : Instruction { // common base class for the PTX instruction definitions
+      dag OutOperandList = oops;
+      dag InOperandList = !con(iops, (ins pred:$_p)); // appends a pred:$_p input after the caller's inputs
+      let AsmString = asmstr; // Predicate printing is defined elsewhere.
+      let Pattern = pattern;
+      let isPredicable = 1; // every InstPTX instruction is marked predicable
+  }
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXInstrInfo.cpp b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.cpp
new file mode 100644
index 0000000..443cd54
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.cpp
@@ -0,0 +1,359 @@
+//===-- PTXInstrInfo.cpp - PTX Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-instrinfo"
+
+#include "PTXInstrInfo.h"
+#include "PTX.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_INSTRINFO_CTOR
+#include "PTXGenInstrInfo.inc"
+
+using namespace llvm;
+
+PTXInstrInfo::PTXInstrInfo(PTXTargetMachine &_TM) // stores a reference to the target machine
+  : PTXGenInstrInfo(),
+    RI(_TM, *this), TM(_TM) {} // RI is constructed from the target machine and this object
+
+static const struct map_entry { // one row: a register class and its reg-to-reg move opcode
+  const TargetRegisterClass *cls;
+  const int opcode;
+} map[] = { // scanned linearly by copyPhysReg and copyRegToReg
+  { &PTX::RegI16RegClass, PTX::MOVU16rr },
+  { &PTX::RegI32RegClass, PTX::MOVU32rr },
+  { &PTX::RegI64RegClass, PTX::MOVU64rr },
+  { &PTX::RegF32RegClass, PTX::MOVF32rr },
+  { &PTX::RegF64RegClass, PTX::MOVF64rr },
+  { &PTX::RegPredRegClass,   PTX::MOVPREDrr }
+};
+
+void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL,
+                               unsigned DstReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // Emit, before I, the register-to-register move that matches DstReg's
+  // register class, then append the default predicate operands. The source
+  // register's class is not compared against the destination's.
+  const MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = RegInfo.getRegClass(DstReg);
+
+  const map_entry *const End = map + sizeof(map)/sizeof(map[0]);
+  for (const map_entry *Entry = map; Entry != End; ++Entry) {
+    if (Entry->cls != DstRC)
+      continue;
+    MachineInstr *Move = BuildMI(MBB, I, DL, get(Entry->opcode), DstReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPredicate(Move);
+    return;
+  }
+  llvm_unreachable("Impossible reg-to-reg copy"); // no table row for this class
+}
+
+bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I,
+                                unsigned DstReg, unsigned SrcReg,
+                                const TargetRegisterClass *DstRC,
+                                const TargetRegisterClass *SrcRC,
+                                DebugLoc DL) const {
+  // Same-class copies only; false when the classes differ or have no move.
+  if (SrcRC != DstRC)
+    return false;
+  for (unsigned Idx = 0, End = sizeof(map)/sizeof(map[0]); Idx != End; ++Idx) {
+    if (map[Idx].cls != DstRC)
+      continue;
+    MachineInstr *Move =
+      BuildMI(MBB, I, DL, get(map[Idx].opcode), DstReg).addReg(SrcReg);
+    AddDefaultPredicate(Move);
+    return true;
+  }
+  return false;
+}
+
+bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned &SrcReg, unsigned &DstReg,
+                               unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+  switch (MI.getOpcode()) { // only the plain reg-to-reg moves are recognized
+    case PTX::MOVU16rr:
+    case PTX::MOVU32rr:
+    case PTX::MOVU64rr:
+    case PTX::MOVF32rr:
+    case PTX::MOVF64rr:
+    case PTX::MOVPREDrr:
+      break;
+    default: return false; // out-parameters are left untouched
+  }
+  assert(MI.getNumOperands() >= 2 &&
+         MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
+         "Invalid register-register move instruction");
+  SrcSubIdx = DstSubIdx = 0;          // no sub-registers
+  DstReg = MI.getOperand(0).getReg(); // operand 0 is the destination
+  SrcReg = MI.getOperand(1).getReg(); // operand 1 is the source
+  return true;
+}
+
+// predicate support
+
+bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const {
+  const int PredIdx = MI->findFirstPredOperandIdx(); // -1 when MI has none
+  return !(PredIdx == -1 || MI->getOperand(PredIdx).getReg() == PTX::NoRegister);
+}
+
+bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  return isPredicated(MI) ? false : MI->isTerminator(); // terminator without a guard
+}
+
+bool PTXInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  // Pred holds (predicate register, predicate-op immediate); both values are
+  // copied into MI's existing predicate operand pair. Always returns true.
+  const unsigned NumPredOps = Pred.size();
+  if (NumPredOps < 2)
+    llvm_unreachable("lesser than 2 predicate operands are provided");
+  const int PredIdx = MI->findFirstPredOperandIdx();
+  if (PredIdx == -1)
+    llvm_unreachable("missing predicate operand");
+  MI->getOperand(PredIdx).setReg(Pred[0].getReg());     // predicate register
+  MI->getOperand(PredIdx + 1).setImm(Pred[1].getImm()); // predicate-op immediate
+  return true;
+}
+
+bool PTXInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  // Each predicate is a (register, immediate) pair. Pred1 is reported as
+  // subsuming Pred2 only when the two pairs are identical: the registers
+  // are compared first, then the immediates.
+
+  const unsigned Reg1 = Pred1[0].getReg();
+  const unsigned Reg2 = Pred2[0].getReg();
+  if (Reg1 != Reg2)
+    return false;
+
+  // Registers agree, so the predicate-op immediates decide.
+  return Pred1[1].getImm() == Pred2[1].getImm();
+}
+
+bool PTXInstrInfo::
+DefinesPredicate(MachineInstr *MI,
+                 std::vector<MachineOperand> &Pred) const {
+  // MI defines a predicate when RI.getRegClass() of its first (register)
+  // operand is RegPred; that operand plus a None immediate go into Pred.
+  // TODO support 5-operand format of setp instruction
+  // NOTE(review): getRegClass gets a register number here -- confirm overload.
+  if (MI->getNumOperands() < 1)
+    return false;
+
+  const MachineOperand &FirstOp = MI->getOperand(0);
+  const bool IsPredReg =
+    FirstOp.isReg() && RI.getRegClass(FirstOp.getReg()) == &PTX::RegPredRegClass;
+  if (!IsPredReg)
+    return false;
+  Pred.push_back(FirstOp);
+  Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None));
+  return true;
+}
+
+// branch support
+
+bool PTXInstrInfo::
+AnalyzeBranch(MachineBasicBlock &MBB,
+              MachineBasicBlock *&TBB, // out: target of the analyzed branch
+              MachineBasicBlock *&FBB, // out: target of a trailing unconditional branch
+              SmallVectorImpl<MachineOperand> &Cond, // out: predicate operand pair of the conditional branch
+              bool AllowModify) const {
+  // Returns false when the block's ending was understood, true otherwise.
+  // TODO implement cases when AllowModify is true
+  if (MBB.empty())
+    return true; // NOTE(review): an empty block takes the "not understood" result
+
+  MachineBasicBlock::iterator iter = MBB.end();
+  const MachineInstr& instLast1 = *--iter; // last instruction of the block
+  // for special case that MBB has only 1 instruction
+  const bool IsSizeOne = MBB.size() == 1;
+  // if IsSizeOne is true, *--iter would be invalid, so instLast2 is bound to
+  // instLast1 as a dummy value; it is only examined when IsSizeOne is false
+  const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter; // second-to-last instruction
+
+  DEBUG(dbgs() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: MBB:    " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: TBB:    " << TBB << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: FBB:    " << FBB << "\n");
+
+  // this block ends with no branches
+  if (!IsAnyKindOfBranch(instLast1)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n");
+    return false; // TBB, FBB and Cond are left as the caller passed them
+  }
+
+  // this block ends with only an unconditional branch
+  if (instLast1.isUnconditionalBranch() &&
+      // when IsSizeOne is true, it "absorbs" the evaluation of instLast2
+      (IsSizeOne || !IsAnyKindOfBranch(instLast2))) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n");
+    TBB = GetBranchTarget(instLast1);
+    return false;
+  }
+
+  // this block ends with a conditional branch and
+  // it falls through to a successor block
+  if (instLast1.isConditionalBranch() &&
+      IsAnySuccessorAlsoLayoutSuccessor(MBB)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n");
+    TBB = GetBranchTarget(instLast1);
+    int i = instLast1.findFirstPredOperandIdx(); // index of the predicate register operand
+    Cond.push_back(instLast1.getOperand(i));   // predicate register
+    Cond.push_back(instLast1.getOperand(i+1)); // operand that follows the predicate register
+    return false;
+  }
+
+  // when IsSizeOne is true, we are done
+  if (IsSizeOne)
+    return true; // a lone terminator that matched none of the cases above
+
+  // this block ends with a conditional branch
+  // followed by an unconditional branch
+  if (instLast2.isConditionalBranch() &&
+      instLast1.isUnconditionalBranch()) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n");
+    TBB = GetBranchTarget(instLast2); // conditional branch target
+    FBB = GetBranchTarget(instLast1); // unconditional branch target
+    int i = instLast2.findFirstPredOperandIdx();
+    Cond.push_back(instLast2.getOperand(i));
+    Cond.push_back(instLast2.getOperand(i+1));
+    return false;
+  }
+
+  // branch cannot be understood
+  DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n");
+  return true;
+}
+
+unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  // Strip instructions from the end of MBB for as long as the last one
+  // satisfies IsAnyKindOfBranch(), and report how many were dropped.
+  // Stops at the first instruction that is not such a branch.
+  unsigned NumRemoved = 0;
+  for (; !MBB.empty() && IsAnyKindOfBranch(MBB.back()); ++NumRemoved)
+    MBB.pop_back();
+
+  DEBUG(dbgs() << "RemoveBranch: MBB:   " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "RemoveBranch: remove " << NumRemoved << " branch inst\n");
+  return NumRemoved;
+}
+
+unsigned PTXInstrInfo::
+InsertBranch(MachineBasicBlock &MBB,
+             MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond,
+             DebugLoc DL) const {
+  // Appends branches to the end of MBB and returns how many were added.
+  // Cond, when used, is the predicate register followed by its immediate.
+  DEBUG(dbgs() << "InsertBranch: MBB: " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "InsertBranch: TBB: "
+               << (TBB ? TBB->getName().str() + "\n" : std::string("(NULL)\n")));
+  DEBUG(dbgs() << "InsertBranch: FBB: "
+               << (FBB ? FBB->getName().str() + "\n" : std::string("(NULL)\n")));
+  DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n");
+  assert(TBB && "TBB is NULL");
+  // No false target and no condition: a lone unpredicated branch to TBB.
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(PTX::BRAd))
+      .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
+    return 1;
+  }
+
+  // Otherwise TBB is reached through a predicated branch...
+  BuildMI(&MBB, DL, get(PTX::BRAdp))
+    .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm());
+  if (!FBB)
+    return 1;
+
+  // ...and an explicit false target gets an unpredicated branch after it.
+  BuildMI(&MBB, DL, get(PTX::BRAd))
+    .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
+  return 2;
+}
+
+// Memory operand folding for spills
+void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MII,
+                                     unsigned SrcReg, bool isKill, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  llvm_unreachable("storeRegToStackSlot should not be called for PTX"); // all arguments are ignored; reaching here is a bug
+}
+
+void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MII,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  llvm_unreachable("loadRegFromStackSlot should not be called for PTX"); // all arguments are ignored; reaching here is a bug
+}
+
+// static helper routines
+
+MachineSDNode *PTXInstrInfo::
+GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
+                  DebugLoc dl, EVT VT, SDValue Op1) {
+  SDValue NoPredReg = DAG->getRegister(PTX::NoRegister, MVT::i1); // "no predicate" register
+  SDValue NoPredOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+  SDValue Operands[] = { Op1, NoPredReg, NoPredOp }; // Op1 then the predicate pair
+  return DAG->getMachineNode(Opcode, dl, VT, Operands, array_lengthof(Operands));
+}
+
+MachineSDNode *PTXInstrInfo::
+GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
+                  DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) {
+  SDValue NoPredReg = DAG->getRegister(PTX::NoRegister, MVT::i1); // "no predicate" register
+  SDValue NoPredOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+  SDValue Operands[] = { Op1, Op2, NoPredReg, NoPredOp }; // Op1, Op2, then the predicate pair
+  return DAG->getMachineNode(Opcode, dl, VT, Operands, array_lengthof(Operands));
+}
+
+void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) {
+  if (MI->findFirstPredOperandIdx() != -1)
+    return; // MI already has a predicate operand; leave it alone
+  MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
+  MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None));
+}
+
+bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) {
+  return inst.isTerminator() || inst.isBranch() || inst.isIndirectBranch(); // note: any terminator counts, not only branches
+}
+
+bool PTXInstrInfo::
+IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) {
+  for (MachineBasicBlock::const_succ_iterator // scan MBB's CFG successors
+      i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i)
+    if (MBB.isLayoutSuccessor(*i)) // *i is the successor block pointer itself
+      return true;  // some successor is also the layout successor
+  return false;     // none of the successors is the layout successor
+}
+
+MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) {
+  // FIXME: relies on every branch keeping its destination in operand 0.
+  const MachineOperand& DestOp = inst.getOperand(0);
+  assert(DestOp.isMBB() && "FIXME: detect branch target operand");
+  return DestOp.getMBB();
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXInstrInfo.h b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.h
new file mode 100644
index 0000000..fba89c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.h
@@ -0,0 +1,133 @@
+//===-- PTXInstrInfo.h - PTX Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_INSTR_INFO_H
+#define PTX_INSTR_INFO_H
+
+#include "PTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "PTXGenInstrInfo.inc"
+
+namespace llvm {
+class PTXTargetMachine;
+
+class MachineSDNode;
+class SDValue;
+class SelectionDAG;
+
+class PTXInstrInfo : public PTXGenInstrInfo { // PTX instruction info layered on the generated PTXGenInstrInfo
+private:
+  const PTXRegisterInfo RI; // register info, constructed from the target machine and this object
+  PTXTargetMachine &TM;     // target machine handed to the constructor
+
+public:
+  explicit PTXInstrInfo(PTXTargetMachine &_TM);
+
+  virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; } // exposes RI
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB, // emits the move matching DstReg's register class
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DstReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual bool copyRegToReg(MachineBasicBlock &MBB, // same-class copy; false if no move was emitted
+                            MachineBasicBlock::iterator I,
+                            unsigned DstReg, unsigned SrcReg,
+                            const TargetRegisterClass *DstRC,
+                            const TargetRegisterClass *SrcRC,
+                            DebugLoc DL) const;
+
+  virtual bool isMoveInstr(const MachineInstr& MI, // true for the plain reg-to-reg move opcodes
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+  // predicate support
+
+  virtual bool isPredicated(const MachineInstr *MI) const; // predicate operand present and not NoRegister
+
+  virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; // terminator that is not predicated
+
+  virtual // overwrites MI's predicate operand pair with Pred[0] / Pred[1]
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const;
+
+  virtual // true only when both predicate pairs are identical
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  virtual bool DefinesPredicate(MachineInstr *MI, // appends the defined predicate to Pred
+                                std::vector<MachineOperand> &Pred) const;
+
+  // PTX is fully-predicable
+  virtual bool isPredicable(MachineInstr *MI) const { return true; }
+
+  // branch support
+
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // false when understood, true otherwise
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify = false) const;
+
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; // returns the number of instructions removed
+
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, // returns the number of branches added
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+
+  // Memory operand folding for spills
+  // TODO: Implement this eventually and get rid of storeRegToStackSlot and
+  //       loadRegFromStackSlot.  Doing so will get rid of the "stack" registers
+  //       we currently use to spill, though I doubt the overall effect on ptxas
+  //       output will be large.  I have yet to see a case where ptxas is unable
+  //       to see through the "stack" register usage and hence generates
+  //       efficient code anyway.
+  // virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+  //                                             MachineInstr* MI,
+  //                                       const SmallVectorImpl<unsigned> &Ops,
+  //                                             int FrameIndex) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock& MBB, // implementation is llvm_unreachable
+                                   MachineBasicBlock::iterator MII,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass* RC,
+                                   const TargetRegisterInfo* TRI) const;
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, // implementation is llvm_unreachable
+                                    MachineBasicBlock::iterator MII,
+                                    unsigned DestReg, int FrameIdx,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  // static helper routines
+
+  static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, // node with Op1 plus the "no predicate" pair
+                                          DebugLoc dl, EVT VT,
+                                          SDValue Op1);
+
+  static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, // node with Op1, Op2 plus the "no predicate" pair
+                                          DebugLoc dl, EVT VT,
+                                          SDValue Op1, SDValue Op2);
+
+  static void AddDefaultPredicate(MachineInstr *MI); // appends (NoRegister, None) if MI has no predicate operand
+
+  static bool IsAnyKindOfBranch(const MachineInstr& inst); // terminator, branch, or indirect branch
+
+  static bool IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB);
+
+  static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst); // block named by operand 0
+}; // class PTXInstrInfo
+} // namespace llvm
+
+#endif // PTX_INSTR_INFO_H
diff --git a/contrib/llvm/lib/Target/PTX/PTXInstrInfo.td b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.td
new file mode 100644
index 0000000..bead428
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXInstrInfo.td
@@ -0,0 +1,1031 @@
+//===-- PTXInstrInfo.td - PTX Instruction defs --------------*- tablegen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "PTXInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Code Generation Predicates
+//===----------------------------------------------------------------------===//
+
+// Shader Model Support: whether fdiv / fmad need an explicit rounding mode.
+def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">;
+def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">;
+def FMadNeedsRoundingMode : Predicate<"getSubtarget().fmadNeedsRoundingMode()">;
+def FMadNoRoundingMode : Predicate<"!getSubtarget().fmadNeedsRoundingMode()">;
+
+// PTX Version Support: each version has a positive and a negated predicate.
+def SupportsPTX21       : Predicate<"getSubtarget().supportsPTX21()">;
+def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">;
+def SupportsPTX22       : Predicate<"getSubtarget().supportsPTX22()">;
+def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">;
+def SupportsPTX23       : Predicate<"getSubtarget().supportsPTX23()">;
+def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
+
+// Fused-Multiply Add: availability as reported by getSubtarget().supportsFMA().
+def SupportsFMA         : Predicate<"getSubtarget().supportsFMA()">;
+def DoesNotSupportFMA   : Predicate<"!getSubtarget().supportsFMA()">;
+
+
+
+// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+// def SDT_PTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart,
+//                               [SDNPHasChain, SDNPOutGlue]>;
+// def PTXcallseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd,
+//                               [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def PTXcall : SDNode<"PTXISD::CALL", SDTNone, // DAG node for PTXISD::CALL, using the SDTNone profile
+                     [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>; // chained, variadic, optional glue in, glue out
+
+
+// Branch targets have OtherVT type; call targets are declared as i32 operands.
+def brtarget   : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// PTX Specific Node Definitions
+//===----------------------------------------------------------------------===//
+
+// PTX allows generic 3-register shifts such as shl r0, r1, r2.
+def PTXshl : SDNode<"ISD::SHL", SDTIntBinOp>; // reuses the generic ISD::SHL opcode
+def PTXsrl : SDNode<"ISD::SRL", SDTIntBinOp>; // reuses the generic ISD::SRL opcode
+def PTXsra : SDNode<"ISD::SRA", SDTIntBinOp>; // reuses the generic ISD::SRA opcode
+
+def PTXexit // DAG node for PTXISD::EXIT
+  : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>;
+def PTXret // DAG node for PTXISD::RET; chained, variadic, optional glue in
+  : SDNode<"PTXISD::RET",  SDTNone,
+           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def PTXcopyaddress // DAG node for PTXISD::COPY_ADDRESS: one result, one operand, no constraints or properties
+  : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// For floating-point instructions, we cannot just embed the pattern into the
+// instruction definition since we need to muck around with the rounding mode,
+// and I do not know how to insert constants into instructions directly from
+// pattern matches.
+
+//===- Floating-Point Instructions - 2 Operand Form -----------------------===//
+multiclass PTX_FLOAT_2OP<string opcstr> { // one destination, one source; $r (RndMode) is printed right after opcstr
+  def rr32 : InstPTX<(outs RegF32:$d), // f32, register source
+                     (ins RndMode:$r, RegF32:$a),
+                     !strconcat(opcstr, "$r.f32\t$d, $a"), []>; // no pattern attached here
+  def ri32 : InstPTX<(outs RegF32:$d), // f32, immediate source
+                     (ins RndMode:$r, f32imm:$a),
+                     !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
+  def rr64 : InstPTX<(outs RegF64:$d), // f64, register source
+                     (ins RndMode:$r, RegF64:$a),
+                     !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
+  def ri64 : InstPTX<(outs RegF64:$d), // f64, immediate source
+                     (ins RndMode:$r, f64imm:$a),
+                     !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
+}
+
+//===- Floating-Point Instructions - 3 Operand Form -----------------------===//
+multiclass PTX_FLOAT_3OP<string opcstr> { // one destination, two sources; $r (RndMode) is printed right after opcstr
+  def rr32 : InstPTX<(outs RegF32:$d), // f32, both sources in registers
+                     (ins RndMode:$r, RegF32:$a, RegF32:$b),
+                     !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>; // no pattern attached here
+  def ri32 : InstPTX<(outs RegF32:$d), // f32, second source is an immediate
+                     (ins RndMode:$r, RegF32:$a, f32imm:$b),
+                     !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
+  def rr64 : InstPTX<(outs RegF64:$d), // f64, both sources in registers
+                     (ins RndMode:$r, RegF64:$a, RegF64:$b),
+                     !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
+  def ri64 : InstPTX<(outs RegF64:$d), // f64, second source is an immediate
+                     (ins RndMode:$r, RegF64:$a, f64imm:$b),
+                     !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
+}
+
+//===- Floating-Point Instructions - 4 Operand Form -----------------------===//
+multiclass PTX_FLOAT_4OP<string opcstr> { // one destination, three sources; $r (RndMode) is printed right after opcstr
+  def rrr32 : InstPTX<(outs RegF32:$d), // f32, all sources in registers
+                      (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>; // no pattern attached here
+  def rri32 : InstPTX<(outs RegF32:$d), // f32, last source is an immediate
+                      (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
+  def rii32 : InstPTX<(outs RegF32:$d), // f32, last two sources are immediates
+                      (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
+  def rrr64 : InstPTX<(outs RegF64:$d), // f64, all sources in registers
+                      (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
+  def rri64 : InstPTX<(outs RegF64:$d), // f64, last source is an immediate
+                      (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
+  def rii64 : InstPTX<(outs RegF64:$d), // f64, last two sources are immediates
+                      (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
+}
+
+//===- Integer Instructions - 3 Operand Form ------------------------------===//
+multiclass PTX_INT3<string opcstr, SDNode opnode> { // d = opnode(a, b), printed with a .uNN type suffix
+  def rr16 : InstPTX<(outs RegI16:$d), // 16-bit, register second operand
+                     (ins RegI16:$a, RegI16:$b),
+                     !strconcat(opcstr, ".u16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d), // 16-bit, immediate second operand
+                     (ins RegI16:$a, i16imm:$b),
+                     !strconcat(opcstr, ".u16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d), // 32-bit, register second operand
+                     (ins RegI32:$a, RegI32:$b),
+                     !strconcat(opcstr, ".u32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d), // 32-bit, immediate second operand
+                     (ins RegI32:$a, i32imm:$b),
+                     !strconcat(opcstr, ".u32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d), // 64-bit, register second operand
+                     (ins RegI64:$a, RegI64:$b),
+                     !strconcat(opcstr, ".u64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d), // 64-bit, immediate second operand
+                     (ins RegI64:$a, i64imm:$b),
+                     !strconcat(opcstr, ".u64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+}
+
+//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===//
+multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> { // d = opnode(a, b), printed with a .sNN type suffix
+  def rr16 : InstPTX<(outs RegI16:$d), // 16-bit, register second operand
+                     (ins RegI16:$a, RegI16:$b),
+                     !strconcat(opcstr, ".s16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d), // 16-bit, immediate second operand
+                     (ins RegI16:$a, i16imm:$b),
+                     !strconcat(opcstr, ".s16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d), // 32-bit, register second operand
+                     (ins RegI32:$a, RegI32:$b),
+                     !strconcat(opcstr, ".s32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d), // 32-bit, immediate second operand
+                     (ins RegI32:$a, i32imm:$b),
+                     !strconcat(opcstr, ".s32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d), // 64-bit, register second operand
+                     (ins RegI64:$a, RegI64:$b),
+                     !strconcat(opcstr, ".s64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d), // 64-bit, immediate second operand
+                     (ins RegI64:$a, i64imm:$b),
+                     !strconcat(opcstr, ".s64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+}
+
+//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===//
+multiclass PTX_LOGIC<string opcstr, SDNode opnode> { // .pred and .bNN forms
+  def ripreds : InstPTX<(outs RegPred:$d), // predicate, $b is an immediate
+                     (ins RegPred:$a, i1imm:$b),
+                     !strconcat(opcstr, ".pred\t$d, $a, $b"),
+                     [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>;
+  def rrpreds : InstPTX<(outs RegPred:$d), // predicate, $b is a register
+                     (ins RegPred:$a, RegPred:$b),
+                     !strconcat(opcstr, ".pred\t$d, $a, $b"),
+                     [(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>;
+  def rr16 : InstPTX<(outs RegI16:$d), // 16-bit, $b is a register
+                     (ins RegI16:$a, RegI16:$b),
+                     !strconcat(opcstr, ".b16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d), // 16-bit, $b is an immediate
+                     (ins RegI16:$a, i16imm:$b),
+                     !strconcat(opcstr, ".b16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d), // 32-bit, $b is a register
+                     (ins RegI32:$a, RegI32:$b),
+                     !strconcat(opcstr, ".b32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d), // 32-bit, $b is an immediate
+                     (ins RegI32:$a, i32imm:$b),
+                     !strconcat(opcstr, ".b32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d), // 64-bit, $b is a register
+                     (ins RegI64:$a, RegI64:$b),
+                     !strconcat(opcstr, ".b64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d), // 64-bit, $b is an immediate
+                     (ins RegI64:$a, i64imm:$b),
+                     !strconcat(opcstr, ".b64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+}
+
+//===- Integer Shift Instructions - 3 Operand Form ------------------------===//
+multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> { // appends 16/32/64 only
+  def rr16 : InstPTX<(outs RegI16:$d), // 16-bit, both operands in registers
+                     (ins RegI16:$a, RegI16:$b),
+                     !strconcat(opcstr, "16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d), // 32-bit, both operands in registers
+                     (ins RegI32:$a, RegI32:$b),
+                     !strconcat(opcstr, "32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d), // 64-bit, both operands in registers
+                     (ins RegI64:$a, RegI64:$b),
+                     !strconcat(opcstr, "64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d), // 16-bit, $b is an immediate
+                     (ins RegI16:$a, i16imm:$b),
+                     !strconcat(opcstr, "16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d), // 32-bit, $b is an immediate
+                     (ins RegI32:$a, i32imm:$b),
+                     !strconcat(opcstr, "32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d), // 64-bit, $b is an immediate
+                     (ins RegI64:$a, i64imm:$b),
+                     !strconcat(opcstr, "64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+  def ir16 : InstPTX<(outs RegI16:$d), // 16-bit, $a is an immediate
+                     (ins i16imm:$a, RegI16:$b),
+                     !strconcat(opcstr, "16\t$d, $a, $b"),
+                     [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>;
+  def ir32 : InstPTX<(outs RegI32:$d), // 32-bit, $a is an immediate
+                     (ins i32imm:$a, RegI32:$b),
+                     !strconcat(opcstr, "32\t$d, $a, $b"),
+                     [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>;
+  def ir64 : InstPTX<(outs RegI64:$d), // 64-bit, $a is an immediate
+                     (ins i64imm:$a, RegI64:$b),
+                     !strconcat(opcstr, "64\t$d, $a, $b"),
+                     [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>;
+}
+
+//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===//
+multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
+                        CondCode cmp, string cmpstr> {
+  // TODO support 5-operand format: p|q, a, b, c
+
+  def rr // $p = ($a cmp $b), $b in a register
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>;
+  def ri // $p = ($a cmp $b), $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>;
+
+  def rr_and_r // $p = ($a cmp $b) and $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
+  def ri_and_r // as rr_and_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+                                     RegPred:$c))]>;
+  def rr_or_r // $p = ($a cmp $b) or $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
+  def ri_or_r // as rr_or_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
+  def rr_xor_r // $p = ($a cmp $b) xor $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
+  def ri_xor_r // as rr_xor_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+                                     RegPred:$c))]>;
+
+  def rr_and_not_r // $p = ($a cmp $b) and (not $c); $c is printed as "!$c"
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp),
+                                     (not RegPred:$c)))]>;
+  def ri_and_not_r // as rr_and_not_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+                                     (not RegPred:$c)))]>;
+  def rr_or_not_r // $p = ($a cmp $b) or (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp),
+                                    (not RegPred:$c)))]>;
+  def ri_or_not_r // as rr_or_not_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp),
+                                    (not RegPred:$c)))]>;
+  def rr_xor_not_r // $p = ($a cmp $b) xor (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp),
+                                     (not RegPred:$c)))]>;
+  def ri_xor_not_r // as rr_xor_not_r, $b an immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+                                     (not RegPred:$c)))]>;
+}
+
+//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===//
+multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls,
+                        CondCode ucmp, CondCode ocmp, string cmpstr> {
+  // TODO support 5-operand format: p|q, a, b, c
+
+  def rr_u // matches ucmp; mnemonic is cmpstr with a "u" appended
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
+              !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>;
+  def rr_o // matches ocmp; mnemonic is the bare cmpstr
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>;
+
+  def ri_u // as rr_u, $b a floating-point immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+              !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>;
+  def ri_o // as rr_o, $b a floating-point immediate
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>;
+
+  def rr_and_r_u // $p = ($a ucmp $b) and $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+                                     RegPred:$c))]>;
+  def rr_and_r_o // $p = ($a ocmp $b) and $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+                                     RegPred:$c))]>;
+
+  def rr_or_r_u // $p = ($a ucmp $b) or $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.or.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+  def rr_or_r_o // $p = ($a ocmp $b) or $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+
+  def rr_xor_r_u // $p = ($a ucmp $b) xor $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+                                     RegPred:$c))]>;
+  def rr_xor_r_o // $p = ($a ocmp $b) xor $c
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+                                     RegPred:$c))]>;
+
+  def rr_and_not_r_u // $p = ($a ucmp $b) and (not $c); prints "!$c"
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+                                     (not RegPred:$c)))]>;
+  def rr_and_not_r_o // $p = ($a ocmp $b) and (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+                                     (not RegPred:$c)))]>;
+
+  def rr_or_not_r_u // $p = ($a ucmp $b) or (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp),
+                                    (not RegPred:$c)))]>;
+  def rr_or_not_r_o // $p = ($a ocmp $b) or (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp),
+                                    (not RegPred:$c)))]>;
+
+  def rr_xor_not_r_u // $p = ($a ucmp $b) xor (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+                                     (not RegPred:$c)))]>;
+  def rr_xor_not_r_o // $p = ($a ocmp $b) xor (not $c)
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname,
+                         "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+                                     (not RegPred:$c)))]>;
+}
+
+//===- Select Predicate Instructions - 4 Operand Form ---------------------===//
+multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls,
+                    SDNode immnode> {
+  def rr // $r = select($a, $b, $c); the predicate $a is printed last
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>;
+  def ri // as rr, with $c matched through immnode
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>;
+  def ii // as rr, with both $b and $c matched through immnode
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+///===- Integer Arithmetic Instructions -----------------------------------===//
+
+defm ADD  : PTX_INT3<"add", add>;         // add.uNN
+defm SUB  : PTX_INT3<"sub", sub>;         // sub.uNN
+defm MUL  : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
+defm DIV  : PTX_INT3<"div", udiv>;        // unsigned divide, div.uNN
+defm SDIV : PTX_INT3_SIGNED<"div", sdiv>; // signed divide, div.sNN
+defm REM  : PTX_INT3<"rem", urem>;        // unsigned remainder, rem.uNN
+
+///===- Floating-Point Arithmetic Instructions ----------------------------===//
+
+// FNEG - printed as "neg"
+defm FNEG : PTX_FLOAT_2OP<"neg">;
+
+// Standard Binary Operations (matched by the Pat<> records further down)
+defm FADD : PTX_FLOAT_3OP<"add">;
+defm FSUB : PTX_FLOAT_3OP<"sub">;
+defm FMUL : PTX_FLOAT_3OP<"mul">;
+defm FDIV : PTX_FLOAT_3OP<"div">;
+
+// Multi-operation hybrid instructions (mad; gated on SupportsFMA)
+defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>;
+
+
+///===- Floating-Point Intrinsic Instructions -----------------------------===//
+
+// SQRT - $r is the rounding-mode operand; the pattern lists are empty
+def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                        "sqrt$r.f32\t$d, $a", []>;
+def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                        "sqrt$r.f32\t$d, $a", []>;
+def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                        "sqrt$r.f64\t$d, $a", []>;
+def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                        "sqrt$r.f64\t$d, $a", []>;
+
+// SIN - NOTE(review): PTX ISA appears to list sin for .f32 only; check f64
+def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                       "sin$r.f32\t$d, $a", []>;
+def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                       "sin$r.f32\t$d, $a", []>;
+def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                       "sin$r.f64\t$d, $a", []>;
+def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                       "sin$r.f64\t$d, $a", []>;
+
+// COS - NOTE(review): PTX ISA appears to list cos for .f32 only; check f64
+def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+                       "cos$r.f32\t$d, $a", []>;
+def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+                       "cos$r.f32\t$d, $a", []>;
+def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+                       "cos$r.f64\t$d, $a", []>;
+def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+                       "cos$r.f64\t$d, $a", []>;
+
+
+
+
+///===- Comparison and Selection Instructions -----------------------------===//
+
+// .setp
+
+// Compare i16: eq/ne use the u16 suffix; ordering has u16 and s16 forms
+
+defm SETPEQu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETEQ,  "eq">;
+defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE,  "ne">;
+defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">;
+defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">;
+defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">;
+defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">;
+defm SETPLTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLT,  "lt">;
+defm SETPLEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETLE,  "le">;
+defm SETPGTs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGT,  "gt">;
+defm SETPGEs16 : PTX_SETP_I<RegI16, "s16", i16imm, SETGE,  "ge">;
+
+// Compare i32: eq/ne use the u32 suffix; ordering has u32 and s32 forms
+
+defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ,  "eq">;
+defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE,  "ne">;
+defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">;
+defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">;
+defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">;
+defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">;
+defm SETPLTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLT,  "lt">;
+defm SETPLEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETLE,  "le">;
+defm SETPGTs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGT,  "gt">;
+defm SETPGEs32 : PTX_SETP_I<RegI32, "s32", i32imm, SETGE,  "ge">;
+
+// Compare i64: eq/ne use the u64 suffix; ordering has u64 and s64 forms
+
+defm SETPEQu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETEQ,  "eq">;
+defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE,  "ne">;
+defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">;
+defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">;
+defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">;
+defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">;
+defm SETPLTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLT,  "lt">;
+defm SETPLEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETLE,  "le">;
+defm SETPGTs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGT,  "gt">;
+defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE,  "ge">;
+
+// Compare f32: each defm supplies the unordered then the ordered CondCode
+
+defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">;
+defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">;
+defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">;
+defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">;
+
+// Compare f64: each defm supplies the unordered then the ordered CondCode
+
+defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">;
+defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">;
+defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">;
+defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">;
+
+// .selp (the last argument picks imm vs. fpimm for the immediate forms)
+
+defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>;
+defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>;
+defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>;
+defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>;
+defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>;
+
+///===- Logic and Shift Instructions --------------------------------------===//
+
+defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>; // shl.b16 / shl.b32 / shl.b64
+defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>; // shr.u16 / shr.u32 / shr.u64
+defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>; // shr.s16 / shr.s32 / shr.s64
+
+defm AND : PTX_LOGIC<"and", and>; // and.pred plus and.b16/.b32/.b64
+defm OR  : PTX_LOGIC<"or",  or>;  // or.pred plus or.b16/.b32/.b64
+defm XOR : PTX_LOGIC<"xor", xor>; // xor.pred plus xor.b16/.b32/.b64
+
+///===- Data Movement and Conversion Instructions -------------------------===//
+
+// any_extend
+// Implement the anyext instruction in terms of the PTX cvt instructions.
+//def : Pat<(i32 (anyext RegI16:$a)), (CVTu32u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI16:$a)), (CVTu64u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI32:$a)), (CVTu64u32 RegI32:$a)>;
+
+// bitconvert
+// Same-width moves between integer and floating-point register classes,
+// printed as mov.b32 / mov.b64; the pattern lists are empty.
+def MOVi32f32
+  : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>;
+def MOVf32i32
+  : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>;
+def MOVi64f64
+  : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>;
+def MOVf64i64
+  : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>;
+
+let neverHasSideEffects = 1 in { // register-to-register moves, no patterns
+  def MOVPREDrr
+    : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>;
+  def MOVU16rr
+    : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>;
+  def MOVU32rr
+    : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>;
+  def MOVU64rr
+    : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>;
+  def MOVF32rr
+    : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>;
+  def MOVF64rr
+    : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in { // immediate moves
+  def MOVPREDri // matches a bare imm producing a predicate
+    : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a",
+              [(set RegPred:$d, imm:$a)]>;
+  def MOVU16ri // matches a bare imm producing a 16-bit integer
+    : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a",
+              [(set RegI16:$d, imm:$a)]>;
+  def MOVU32ri // matches a bare imm producing a 32-bit integer
+    : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
+              [(set RegI32:$d, imm:$a)]>;
+  def MOVU64ri // matches a bare imm producing a 64-bit integer
+    : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
+              [(set RegI64:$d, imm:$a)]>;
+  def MOVF32ri // matches a bare fpimm producing an f32
+    : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
+              [(set RegF32:$d, fpimm:$a)]>;
+  def MOVF64ri // matches a bare fpimm producing an f64
+    : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a",
+              [(set RegF64:$d, fpimm:$a)]>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in { // PTXcopyaddress forms
+  def MOVaddr32 // global address into a 32-bit register
+    : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
+              [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>;
+  def MOVaddr64 // global address into a 64-bit register
+    : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
+              [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
+  def MOVframe32 // frame index, printed as cvta.local.u32
+    : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "cvta.local.u32\t$d, $a",
+              [(set RegI32:$d, (PTXcopyaddress frameindex:$a))]>;
+  def MOVframe64 // frame index, printed as cvta.local.u64
+    : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "cvta.local.u64\t$d, $a",
+              [(set RegI64:$d, (PTXcopyaddress frameindex:$a))]>;
+}
+
+// PTX cvt instructions
+// Not all of these may actually be used; we simply define every conversion
+// that makes sense.
+// FIXME: Can we collapse this somehow into a multiclass def?
+
+// To i16: integer sources take no rounding operand; FP sources take $r.
+def CVTu16u32
+  : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>;
+def CVTu16u64
+  : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>;
+def CVTu16f32
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u16.f32\t$d, $a", []>;
+def CVTs16f32
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s16.f32\t$d, $a", []>;
+def CVTu16f64
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u16.f64\t$d, $a", []>;
+def CVTs16f64
+  : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s16.f64\t$d, $a", []>;
+
+// To i32: integer sources take no rounding operand; FP sources take $r.
+def CVTu32u16
+  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>;
+def CVTs32s16
+  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>;
+def CVTu32u64
+  : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>;
+def CVTu32f32
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u32.f32\t$d, $a", []>;
+def CVTs32f32
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s32.f32\t$d, $a", []>;
+def CVTu32f64
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u32.f64\t$d, $a", []>;
+def CVTs32f64
+  : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s32.f64\t$d, $a", []>;
+
+// To i64: integer sources take no rounding operand; FP sources take $r.
+def CVTu64u16
+  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>;
+def CVTs64s16
+  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>;
+def CVTu64u32
+  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>;
+def CVTs64s32
+  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>;
+def CVTu64f32
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.u64.f32\t$d, $a", []>;
+def CVTs64f32
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+            "cvt$r.s64.f32\t$d, $a", []>;
+def CVTu64f64
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.u64.f64\t$d, $a", []>;
+def CVTs64f64
+  : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.s64.f64\t$d, $a", []>;
+
+// To f32: every form here, including the f64 source, takes the $r operand.
+def CVTf32u16
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f32.u16\t$d, $a", []>;
+def CVTf32s16
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f32.s16\t$d, $a", []>;
+def CVTf32u32
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f32.u32\t$d, $a", []>;
+def CVTf32s32
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f32.s32\t$d, $a", []>;
+def CVTf32u64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f32.u64\t$d, $a", []>;
+def CVTf32s64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f32.s64\t$d, $a", []>;
+def CVTf32f64
+  : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a),
+            "cvt$r.f32.f64\t$d, $a", []>;
+
+// To f64: integer sources take $r; the f32 source form (CVTf64f32) does not.
+def CVTf64u16
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f64.u16\t$d, $a", []>;
+def CVTf64s16
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+            "cvt$r.f64.s16\t$d, $a", []>;
+def CVTf64u32
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f64.u32\t$d, $a", []>;
+def CVTf64s32
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+            "cvt$r.f64.s32\t$d, $a", []>;
+def CVTf64u64
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f64.u64\t$d, $a", []>;
+def CVTf64s64
+  : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+            "cvt$r.f64.s64\t$d, $a", []>;
+def CVTf64f32
+  : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>;
+
+///===- Control Flow Instructions -----------------------------------------===//
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in { // selected from (br)
+  def BRAd
+    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in { // isBarrier is deliberately not set
+  // FIXME: The pattern part is blank because I cannot (or do not yet know
+  // how to) use the first operand of PredicateOperand (a RegPred register) here
+  // When this is revisited, make sure to also look at LowerSETCC and try to
+  // fold it into negated predicates, if possible.
+  def BRAdp
+    : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
+              [/*(brcond pred:$_p, bb:$d)*/]>;
+}
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in { // operand-less
+  def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
+  def RET  : InstPTX<(outs), (ins), "ret",  [(PTXret)]>;
+}
+
+let hasSideEffects = 1 in { // NOTE(review): CALL carries no operands at all
+  def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>;
+}
+
+///===- Parameter Passing Pseudo-Instructions -----------------------------===//
+
+def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b),
+                            "mov.pred\t$a, %arg$b", []>; // source: %arg<$b>
+def READPARAMI16  : InstPTX<(outs RegI16:$a), (ins i32imm:$b),
+                            "mov.b16\t$a, %arg$b", []>;  // source: %arg<$b>
+def READPARAMI32  : InstPTX<(outs RegI32:$a), (ins i32imm:$b),
+                            "mov.b32\t$a, %arg$b", []>;  // source: %arg<$b>
+def READPARAMI64  : InstPTX<(outs RegI64:$a), (ins i32imm:$b),
+                            "mov.b64\t$a, %arg$b", []>;  // source: %arg<$b>
+def READPARAMF32  : InstPTX<(outs RegF32:$a), (ins i32imm:$b),
+                            "mov.f32\t$a, %arg$b", []>;  // source: %arg<$b>
+def READPARAMF64  : InstPTX<(outs RegF64:$a), (ins i32imm:$b),
+                            "mov.f64\t$a, %arg$b", []>;  // source: %arg<$b>
+// WRITEPARAM*: the asm string is the fixed text "//w"; $a is never printed.
+def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>;
+def WRITEPARAMI16  : InstPTX<(outs), (ins RegI16:$a), "//w", []>;
+def WRITEPARAMI32  : InstPTX<(outs), (ins RegI32:$a), "//w", []>;
+def WRITEPARAMI64  : InstPTX<(outs), (ins RegI64:$a), "//w", []>;
+def WRITEPARAMF32  : InstPTX<(outs), (ins RegF32:$a), "//w", []>;
+def WRITEPARAMF64  : InstPTX<(outs), (ins RegF64:$a), "//w", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Selection Patterns
+//===----------------------------------------------------------------------===//
+
+// FADD - reg/reg and reg/fpimm forms, always with RndDefault as $r
+def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
+          (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)),
+          (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)),
+          (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)),
+          (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FSUB - reg/reg and reg/fpimm forms, always with RndDefault as $r
+def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)),
+          (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)),
+          (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)),
+          (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)),
+          (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL - reg/reg and reg/fpimm forms, always with RndDefault as $r
+def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)),
+          (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)),
+          (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)),
+          (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)),
+          (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FDIV - reg/reg and reg/fpimm forms, always with RndDefault as $r
+def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)),
+          (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)),
+          (FDIVri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)),
+          (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)),
+          (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL+FADD: fold (a * b) + c into a single FMAD when SupportsFMA holds.
+// The opcode suffix has to agree with the operand kinds that were matched:
+// rrr = b and c in registers, rri = c immediate, rii = b and c immediate.
+// NOTE(review): assumes FMADrii32/FMADrii64 exist next to rrr/rri - confirm.
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)),
+          (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>,
+          Requires<[SupportsFMA]>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)),
+          (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>,
+          Requires<[SupportsFMA]>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)),
+          (FMADrii32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>,
+          Requires<[SupportsFMA]>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)),
+          (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>,
+          Requires<[SupportsFMA]>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)),
+          (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>,
+          Requires<[SupportsFMA]>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)),
+          (FMADrii64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>,
+          Requires<[SupportsFMA]>;
+
+// FNEG: register and immediate sources, f32 and f64, passing RndDefault
+def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>;
+
+// FSQRT: register and immediate sources, f32 and f64, passing RndDefault
+def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>;
+
+// FSIN: register and immediate sources, f32 and f64, passing RndDefault
+def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>;
+
+// FCOS: register and immediate sources, f32 and f64, passing RndDefault
+def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>;
+
+// Type conversion notes:
+// - PTX does not directly support converting a predicate to a value, so we
+//   use a select instruction to select either 0 or 1 (integer or fp) based
+//   on the truth value of the predicate.
+// - PTX does not directly support converting to a predicate type, so we fake it
+//   by performing a greater-than test between the value and zero.  This follows
+//   the C convention that any non-zero value is equivalent to 'true'.
+
+// Conversion to pred (fp inputs are bit-moved with MOVi32f32/MOVi64f64 first)
+def : Pat<(i1 (trunc RegI16:$a)),      (SETPGTu16ri RegI16:$a, 0)>;
+def : Pat<(i1 (trunc RegI32:$a)),      (SETPGTu32ri RegI32:$a, 0)>;
+def : Pat<(i1 (trunc RegI64:$a)),      (SETPGTu64ri RegI64:$a, 0)>;
+def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>;
+def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>;
+
+// Conversion to u16 (pred inputs: SELP yields 1/0, or 0xFFFF/0 for sext)
+def : Pat<(i16 (anyext RegPred:$a)),    (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (sext RegPred:$a)),      (SELPi16ii RegPred:$a, 0xFFFF, 0)>;
+def : Pat<(i16 (zext RegPred:$a)),      (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (trunc RegI32:$a)),      (CVTu16u32 RegI32:$a)>;
+def : Pat<(i16 (trunc RegI64:$a)),      (CVTu16u64 RegI64:$a)>;
+def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>;
+def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>;
+
+// Conversion to u32 (pred inputs: SELP yields 1/0, or 0xFFFFFFFF/0 for sext)
+def : Pat<(i32 (anyext RegPred:$a)),    (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (sext RegPred:$a)),      (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>;
+def : Pat<(i32 (zext RegPred:$a)),      (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (anyext RegI16:$a)),     (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (sext RegI16:$a)),       (CVTs32s16 RegI16:$a)>;
+def : Pat<(i32 (zext RegI16:$a)),       (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (trunc RegI64:$a)),      (CVTu32u64 RegI64:$a)>;
+def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>;
+
+// Conversion to u64 (pred inputs: SELP yields 1/0, or all-ones/0 for sext)
+def : Pat<(i64 (anyext RegPred:$a)),    (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (sext RegPred:$a)),      (SELPi64ii RegPred:$a,
+                                         0xFFFFFFFFFFFFFFFF, 0)>;
+def : Pat<(i64 (zext RegPred:$a)),      (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (anyext RegI16:$a)),     (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (sext RegI16:$a)),       (CVTs64s16 RegI16:$a)>;
+def : Pat<(i64 (zext RegI16:$a)),       (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (anyext RegI32:$a)),     (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (sext RegI32:$a)),       (CVTs64s32 RegI32:$a)>;
+def : Pat<(i64 (zext RegI32:$a)),       (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>;
+
+// Conversion to f32 (pred inputs select 1.0f, bit pattern 0x3F800000, or 0)
+def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a,
+                                        (MOVf32i32 0x3F800000), (MOVf32i32 0))>;
+def : Pat<(f32 (uint_to_fp RegI16:$a)),  (CVTf32u16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (sint_to_fp RegI16:$a)),  (CVTf32s16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (uint_to_fp RegI32:$a)),  (CVTf32u32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (sint_to_fp RegI32:$a)),  (CVTf32s32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (uint_to_fp RegI64:$a)),  (CVTf32u64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (sint_to_fp RegI64:$a)),  (CVTf32s64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (fround RegF64:$a)),      (CVTf32f64 RndDefault, RegF64:$a)>;
+def : Pat<(f32 (bitconvert RegI32:$a)),  (MOVf32i32 RegI32:$a)>;
+
+// Conversion to f64 (pred inputs select 1.0, bits 0x3FF0000000000000, or 0)
+def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a,
+                                (MOVf64i64 0x3FF0000000000000), (MOVf64i64 0))>;
+def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (fextend RegF32:$a)),    (CVTf64f32 RegF32:$a)>;
+def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>;
+
+// setcc - predicate inversion for branch conditions: (a != imm) is a ^ imm
+def : Pat<(i1 (setcc RegPred:$a, imm:$b, SETNE)),
+          (XORripreds RegPred:$a, imm:$b)>;
+
+///===- Intrinsic Instructions --------------------------------------------===//
+include "PTXIntrinsicInstrInfo.td"
+
+///===- Load/Store Instructions -------------------------------------------===//
+include "PTXInstrLoadStore.td"
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXInstrLoadStore.td b/contrib/llvm/lib/Target/PTX/PTXInstrLoadStore.td
new file mode 100644
index 0000000..7a62684
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXInstrLoadStore.td
@@ -0,0 +1,278 @@
+//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tablegen-*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX load/store instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Addressing predicates: complementary tests of getSubtarget().is64Bit().
+// We have to differentiate between 32- and 64-bit pointer types.
+def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
+def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments for Loads/Stores
+//===----------------------------------------------------------------------===//
+
+def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  // Accept only loads whose source value is a pointer into the Global space.
+  const Value *SrcVal = cast<LoadSDNode>(N)->getSrcValue();
+  if (!SrcVal)
+    return false;
+  const PointerType *PtrTy = dyn_cast<PointerType>(SrcVal->getType());
+  return PtrTy && PtrTy->getAddressSpace() == PTXStateSpace::Global;
+}]>;
+
+def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  // Accept only loads whose source value is a pointer into the Constant space.
+  const Value *SrcVal = cast<LoadSDNode>(N)->getSrcValue();
+  if (!SrcVal)
+    return false;
+  const PointerType *PtrTy = dyn_cast<PointerType>(SrcVal->getType());
+  return PtrTy && PtrTy->getAddressSpace() == PTXStateSpace::Constant;
+}]>;
+
+def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  // Accept only loads whose source value is a pointer into the Shared space.
+  const Value *SrcVal = cast<LoadSDNode>(N)->getSrcValue();
+  if (!SrcVal)
+    return false;
+  const PointerType *PtrTy = dyn_cast<PointerType>(SrcVal->getType());
+  return PtrTy && PtrTy->getAddressSpace() == PTXStateSpace::Shared;
+}]>;
+
+def store_global
+  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+  // Accept only stores whose source value is a pointer into the Global space.
+  const Value *SrcVal = cast<StoreSDNode>(N)->getSrcValue();
+  if (!SrcVal)
+    return false;
+  const PointerType *PtrTy = dyn_cast<PointerType>(SrcVal->getType());
+  return PtrTy && PtrTy->getAddressSpace() == PTXStateSpace::Global;
+}]>;
+
+def store_shared
+  : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+  // Accept only stores whose source value is a pointer into the Shared space.
+  const Value *SrcVal = cast<StoreSDNode>(N)->getSrcValue();
+  if (!SrcVal)
+    return false;
+  const PointerType *PtrTy = dyn_cast<PointerType>(SrcVal->getType());
+  return PtrTy && PtrTy->getAddressSpace() == PTXStateSpace::Shared;
+}]>;
+
+// Addressing modes: each ComplexPattern yields 2 operands via its Select hook.
+def ADDRrr32    : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRrr64    : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
+def ADDRri32    : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
+def ADDRri64    : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
+def ADDRii32    : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
+def ADDRii64    : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
+def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>;
+def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>;
+
+// Address operands: two sub-operands each, all printed by printMemOperand
+def MEMri32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops RegI32, i32imm);  // register + immediate
+}
+def MEMri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops RegI64, i64imm);  // register + immediate
+}
+def LOCALri32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);  // immediate + immediate
+}
+def LOCALri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i64imm, i64imm);  // immediate + immediate
+}
+def MEMii32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);  // immediate + immediate
+}
+def MEMii64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i64imm, i64imm);  // immediate + immediate
+}
+// The operands below do not correspond to an actual address, so we
+// can use i32 even in 64-bit address modes.
+def MEMpi : Operand<i32> {
+  let PrintMethod = "printParamOperand";
+  let MIOperandInfo = (ops i32imm);
+}
+def MEMret : Operand<i32> {
+  let PrintMethod = "printReturnOperand";
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// Load/store .param space (all nodes: chain, glue out, optional glue in)
+def PTXloadparam
+  : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>,
+           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXstoreparam
+  : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
+           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+// READ_PARAM: one result, one i32 operand.  WRITE_PARAM: one operand only.
+def PTXreadparam
+  : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
+      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXwriteparam
+  : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>,
+      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Classes for loads/stores
+//===----------------------------------------------------------------------===//
+multiclass PTX_LD<string opstr, string typestr, // asm: opstr, typestr, args
+           RegisterClass RC, PatFrag pat_load> { // result class, load fragment
+  def rr32 : InstPTX<(outs RC:$d),               // matched via ADDRrr32
+                     (ins MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRrr32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def rr64 : InstPTX<(outs RC:$d),               // matched via ADDRrr64
+                     (ins MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRrr64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+  def ri32 : InstPTX<(outs RC:$d),               // matched via ADDRri32
+                     (ins MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRri32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def ri64 : InstPTX<(outs RC:$d),               // matched via ADDRri64
+                     (ins MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRri64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+  def ii32 : InstPTX<(outs RC:$d),               // matched via ADDRii32
+                     (ins MEMii32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRii32:$a))]>,
+                     Requires<[Use32BitAddresses]>;
+  def ii64 : InstPTX<(outs RC:$d),               // matched via ADDRii64
+                     (ins MEMii64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (pat_load ADDRii64:$a))]>,
+                     Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_ST<string opstr, string typestr, RegisterClass RC,
+                  PatFrag pat_store> {
+  def rr32 : InstPTX<(outs),                     // matched via ADDRrr32
+                     (ins RC:$d, MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRrr32:$a)]>,
+                     Requires<[Use32BitAddresses]>;
+  def rr64 : InstPTX<(outs),                     // matched via ADDRrr64
+                     (ins RC:$d, MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRrr64:$a)]>,
+                     Requires<[Use64BitAddresses]>;
+  def ri32 : InstPTX<(outs),                     // matched via ADDRri32
+                     (ins RC:$d, MEMri32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRri32:$a)]>,
+                     Requires<[Use32BitAddresses]>;
+  def ri64 : InstPTX<(outs),                     // matched via ADDRri64
+                     (ins RC:$d, MEMri64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRri64:$a)]>,
+                     Requires<[Use64BitAddresses]>;
+  def ii32 : InstPTX<(outs),                     // matched via ADDRii32
+                     (ins RC:$d, MEMii32:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRii32:$a)]>,
+                     Requires<[Use32BitAddresses]>;
+  def ii64 : InstPTX<(outs),                     // matched via ADDRii64
+                     (ins RC:$d, MEMii64:$a),
+                     !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+                     [(pat_store RC:$d, ADDRii64:$a)]>,
+                     Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> {
+  def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a), // ld.local<typestr>
+                      !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+                       [(set RC:$d, (load_global ADDRlocal32:$a))]>;
+  def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a), // ld.local<typestr>
+                      !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+                       [(set RC:$d, (load_global ADDRlocal64:$a))]>;
+  def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a), // st.local<typestr>
+                      !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+                       [(store_global RC:$d, ADDRlocal32:$a)]>;
+  def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a), // st.local<typestr>
+                      !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+                       [(store_global RC:$d, ADDRlocal64:$a)]>;
+} // NOTE(review): patterns use load_global/store_global - confirm intended
+
+multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> {
+  let hasSideEffects = 1 in {  // applies to both LDpi and STpi
+  def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a),  // ld.param<typestr>
+                     !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")),
+                     [(set RC:$d, (PTXloadparam texternalsym:$a))]>;
+  def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a), // st.param<typestr>
+                     !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")),
+                     [(PTXstoreparam texternalsym:$d, RC:$a)]>;
+  }  // end hasSideEffects
+}
+
+multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { // PTX_LD per type
+  defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
+  defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
+  defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
+  defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
+  defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
+}
+
+multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { // PTX_ST per type
+  defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
+  defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
+  defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
+  defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
+  defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for loads/stores
+//===----------------------------------------------------------------------===//
+
+// Global/shared stores (u16/u32/u64/f32/f64 variants via PTX_ST_ALL)
+defm STg : PTX_ST_ALL<"st.global", store_global>;
+defm STs : PTX_ST_ALL<"st.shared", store_shared>;
+
+// Global/shared/constant loads (u16/u32/u64/f32/f64 variants via PTX_LD_ALL)
+defm LDg : PTX_LD_ALL<"ld.global", load_global>;
+defm LDc : PTX_LD_ALL<"ld.const",  load_constant>;
+defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
+
+// Param loads/stores (LDpi/STpi per type, including a .pred variant)
+defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>;
+defm PARAMU16  : PTX_PARAM_LD_ST<".u16", RegI16>;
+defm PARAMU32  : PTX_PARAM_LD_ST<".u32", RegI32>;
+defm PARAMU64  : PTX_PARAM_LD_ST<".u64", RegI64>;
+defm PARAMF32  : PTX_PARAM_LD_ST<".f32", RegF32>;
+defm PARAMF64  : PTX_PARAM_LD_ST<".f64", RegF64>;
+
+// Local loads/stores (LDri32/64 and STri32/64 per type, including .pred)
+defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>;
+defm LOCALU16  : PTX_LOCAL_LD_ST<".u16", RegI16>;
+defm LOCALU32  : PTX_LOCAL_LD_ST<".u32", RegI32>;
+defm LOCALU64  : PTX_LOCAL_LD_ST<".u64", RegI64>;
+defm LOCALF32  : PTX_LOCAL_LD_ST<".f32", RegF32>;
+defm LOCALF64  : PTX_LOCAL_LD_ST<".f64", RegF64>;
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/contrib/llvm/lib/Target/PTX/PTXIntrinsicInstrInfo.td
new file mode 100644
index 0000000..3416f1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXIntrinsicInstrInfo.td
@@ -0,0 +1,110 @@
+//===-- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the PTX-specific intrinsic instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// PTX Special Purpose Register Accessor Intrinsics
+
+class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+  : InstPTX<(outs RegI64:$d), (ins),      // no inputs, one RegI64 result
+            !strconcat("mov.u64\t$d, %", regname),
+            [(set RegI64:$d, (intop))]>;  // selected from the intrinsic call
+
+class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+  : InstPTX<(outs RegI32:$d), (ins),      // no inputs, one RegI32 result
+            !strconcat("mov.u32\t$d, %", regname),
+            [(set RegI32:$d, (intop))]>;  // selected from the intrinsic call
+
+// TODO: add vector-version reads of special registers (cf. disabled *_R64 defs)
+
+//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid",
+//                                                     int_ptx_read_tid_r64>;
+def PTX_READ_TID_X   : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+                                                     int_ptx_read_tid_x>;
+def PTX_READ_TID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+                                                     int_ptx_read_tid_y>;
+def PTX_READ_TID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+                                                     int_ptx_read_tid_z>;
+def PTX_READ_TID_W   : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+                                                     int_ptx_read_tid_w>;
+
+//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid",
+//                                                      int_ptx_read_ntid_r64>;
+def PTX_READ_NTID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+                                                      int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+                                                      int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+                                                      int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+                                                      int_ptx_read_ntid_w>;
+
+def PTX_READ_LANEID  : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+                                                     int_ptx_read_laneid>;
+def PTX_READ_WARPID  : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+                                                     int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+                                                     int_ptx_read_nwarpid>;
+
+//def PTX_READ_CTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
+def PTX_READ_CTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+                                                       int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+                                                       int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+                                                       int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+                                                       int_ptx_read_ctaid_w>;
+
+//def PTX_READ_NCTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
+def PTX_READ_NCTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+                                                        int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+                                                        int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+                                                        int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+                                                        int_ptx_read_nctaid_w>;
+
+def PTX_READ_SMID  : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+                                                   int_ptx_read_smid>;
+def PTX_READ_NSMID  : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+                                                    int_ptx_read_nsmid>;
+def PTX_READ_GRIDID  : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+                                                     int_ptx_read_gridid>;
+
+def PTX_READ_LANEMASK_EQ
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
+def PTX_READ_LANEMASK_LE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
+def PTX_READ_LANEMASK_LT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
+def PTX_READ_LANEMASK_GE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
+def PTX_READ_LANEMASK_GT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
+
+def PTX_READ_CLOCK
+  : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
+def PTX_READ_CLOCK64  // the only active user of the R64 class in this list
+  : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
+
+def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
+def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
+def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
+def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
+
+// PTX Parallel Synchronization and Communication Intrinsics
+
+def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i",
+                           [(int_ptx_bar_sync imm:$i)]>; // immediate $i only
diff --git a/contrib/llvm/lib/Target/PTX/PTXMCAsmStreamer.cpp b/contrib/llvm/lib/Target/PTX/PTXMCAsmStreamer.cpp
new file mode 100644
index 0000000..3ed67a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXMCAsmStreamer.cpp
@@ -0,0 +1,556 @@
+//===-- PTXMCAsmStreamer.cpp - PTX Text Assembly Output -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class PTXMCAsmStreamer : public MCStreamer {
+  formatted_raw_ostream &OS;             // destination of all emitted text
+  const MCAsmInfo &MAI;                  // obtained from the MCContext
+  OwningPtr<MCInstPrinter> InstPrinter;  // owned; tested for null before use
+  OwningPtr<MCCodeEmitter> Emitter;      // owned; tested for null before use
+
+  SmallString<128> CommentToEmit;        // comment text awaiting output
+  raw_svector_ostream CommentStream;     // stream writing into CommentToEmit
+
+  unsigned IsVerboseAsm : 1;
+  unsigned ShowInst : 1;
+
+public:
+  /// \p printer and \p emitter are adopted by the OwningPtr members;
+  /// \p useLoc is accepted but not used.
+  PTXMCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
+                   bool isVerboseAsm, bool useLoc,
+                   MCInstPrinter *printer, MCCodeEmitter *emitter,
+                   bool showInst)
+    : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()),
+      InstPrinter(printer), Emitter(emitter), CommentStream(CommentToEmit),
+      IsVerboseAsm(isVerboseAsm), ShowInst(showInst) {
+    // Instruction-printer comments are only routed here in verbose mode.
+    if (IsVerboseAsm && InstPrinter)
+      InstPrinter->setCommentStream(CommentStream);
+  }
+
+  ~PTXMCAsmStreamer() {}
+
+  /// Ends the current output line; in verbose mode the pending comments
+  /// are written out as part of the line ending.
+  inline void EmitEOL() {
+    if (IsVerboseAsm)
+      EmitCommentsAndEOL();
+    else
+      OS << '\n';   // non-verbose: a bare newline is all that is needed
+  }
+  void EmitCommentsAndEOL();
+
+  /// isVerboseAsm - Reports whether this streamer was created in verbose
+  /// mode.
+  virtual bool isVerboseAsm() const { return IsVerboseAsm; }
+
+  /// hasRawTextSupport - Always true: EmitRawText is implemented.
+  virtual bool hasRawTextSupport() const { return true; }
+
+  /// AddComment - Queue \p T as a comment for the line currently being
+  /// emitted.  The text is dropped unless verbose output is enabled; when
+  /// kept, it is written out by the next EmitEOL().  The terminating
+  /// newline is appended automatically.
+  virtual void AddComment(const Twine &T);
+
+  /// AddEncodingComment - Encoding-comment hook; its definition is empty.
+  virtual void AddEncodingComment(const MCInst &Inst);
+
+  /// GetCommentOS - Stream for writing comment text directly.  Callers must
+  /// end what they write with '\n' themselves (AddComment does that for
+  /// them).  Outside verbose mode the null stream is returned.
+  virtual raw_ostream &GetCommentOS() {
+    if (IsVerboseAsm)
+      return CommentStream;
+    return nulls();  // comments are discarded when not verbose
+  }
+
+  /// AddBlankLine - Emit a line ending through EmitEOL().
+  virtual void AddBlankLine() {
+    EmitEOL();
+  }
+
+  /// @name MCStreamer Interface
+  /// @{
+
+  virtual void ChangeSection(const MCSection *Section);
+  virtual void InitSections() { /* PTX does not use sections */ }
+
+  virtual void EmitLabel(MCSymbol *Symbol);
+
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+
+  virtual void EmitThumbFunc(MCSymbol *Func);
+
+  virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
+
+  virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
+
+  virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                        const MCSymbol *LastLabel,
+                                        const MCSymbol *Label,
+                                        unsigned PointerSize);
+
+  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
+  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass);
+  virtual void EmitCOFFSymbolType(int Type);
+  virtual void EndCOFFSymbolDef();
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment);
+
+  /// EmitLocalCommonSymbol - Local common (.lcomm) symbol hook.
+  ///
+  /// @param Symbol - The common symbol.
+  /// @param Size - The size of the common symbol.
+  /// @param ByteAlignment - The alignment of the common symbol in bytes.
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment);
+
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                            unsigned Size = 0, unsigned ByteAlignment = 0);
+
+  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size, unsigned ByteAlignment = 0);
+
+  virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                             unsigned AddrSpace);
+  virtual void EmitULEB128Value(const MCExpr *Value);
+  virtual void EmitSLEB128Value(const MCExpr *Value);
+  virtual void EmitGPRel32Value(const MCExpr *Value);
+
+
+  virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                        unsigned AddrSpace);
+
+  virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+                                    unsigned ValueSize = 1,
+                                    unsigned MaxBytesToEmit = 0);
+
+  virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                 unsigned MaxBytesToEmit = 0);
+
+  virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                 unsigned char Value = 0);
+
+  virtual void EmitFileDirective(StringRef Filename);
+  virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                      StringRef Filename);
+
+  virtual void EmitInstruction(const MCInst &Inst);
+
+  /// EmitRawText - Write \p String to the output, followed by a line
+  /// ending.  This is the capability that hasRawTextSupport()
+  /// advertises.
+  virtual void EmitRawText(StringRef String);
+
+  virtual void FinishImpl();
+
+  /// @}
+
+}; // class PTXMCAsmStreamer
+
+}
+
+/// TODO: Add appropriate implementation of Emit*() methods when needed
+
+void PTXMCAsmStreamer::AddComment(const Twine &T) {
+  if (IsVerboseAsm) {
+    // Push buffered stream data into CommentToEmit before appending to it.
+    CommentStream.flush();
+
+    // Append the comment text plus the newline that ends its line.
+    T.toVector(CommentToEmit);
+    CommentToEmit.push_back('\n');
+
+    // The vector changed behind the stream's back; let it catch up.
+    CommentStream.resync();
+  }
+}
+
+void PTXMCAsmStreamer::EmitCommentsAndEOL() {
+  // Fast path: nothing queued and nothing buffered, so only end the line.
+  const bool HaveText = !CommentToEmit.empty() ||
+                        CommentStream.GetNumBytesInBuffer() != 0;
+  if (!HaveText) { OS << '\n'; return; }
+
+  CommentStream.flush();
+  StringRef Pending = CommentToEmit.str();
+  assert(Pending.back() == '\n' && "Comment array not newline terminated");
+
+  // One output line per '\n'-terminated chunk: pad to the comment column,
+  // then print the comment string, a space and the chunk.
+  do {
+    const size_t LineEnd = Pending.find('\n');
+    OS.PadToColumn(MAI.getCommentColumn());
+    OS << MAI.getCommentString() << ' ' << Pending.substr(0, LineEnd) << '\n';
+    Pending = Pending.substr(LineEnd + 1);
+  } while (!Pending.empty());
+
+  // Everything was written; empty the buffer and resynchronize the stream
+  // with the now-modified vector.
+  CommentToEmit.clear();
+  CommentStream.resync();
+}
+
+static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
+  assert(Bytes && Bytes <= 8 && "Invalid size!"); // else shift count is UB
+  return Value & (~uint64_t(0) >> (64 - Bytes * 8)); // keep low Bytes*8 bits
+}
+
+void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) {
+  assert(Section && "Cannot switch to a null section!"); // check only; no output
+}
+
+void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
+  // Print "<symbol><label suffix>", end the line, then record the section.
+  OS << *Symbol << MAI.getLabelSuffix();
+  EmitEOL();
+  Symbol->setSection(*getCurrentSection());
+}
+
+void PTXMCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {} // no-op
+
+void PTXMCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {} // no-op
+
+void PTXMCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+  OS << *Symbol << " = " << *Value;  // prints "<symbol> = <value>"
+  EmitEOL();
+
+  // FIXME: Lift context changes into super class.
+  Symbol->setVariableValue(Value);  // also record the value on the symbol
+}
+
+void PTXMCAsmStreamer::EmitWeakReference(MCSymbol *Alias,
+                                         const MCSymbol *Symbol) {
+  OS << ".weakref " << *Alias << ", " << *Symbol;  // ".weakref <alias>, <sym>"
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                                const MCSymbol *LastLabel,
+                                                const MCSymbol *Label,
+                                                unsigned PointerSize) {
+  report_fatal_error("Unimplemented.");  // every call is a fatal error
+}
+
+void PTXMCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                           MCSymbolAttr Attribute) {} // no-op
+
+void PTXMCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+// The COFF and ELF hooks below have empty bodies as well.
+void PTXMCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+
+void PTXMCAsmStreamer::EmitCOFFSymbolStorageClass (int StorageClass) {}
+
+void PTXMCAsmStreamer::EmitCOFFSymbolType (int Type) {}
+
+void PTXMCAsmStreamer::EndCOFFSymbolDef() {}
+
+void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+// Common-symbol, zerofill and TBSS hooks: empty bodies, nothing is emitted.
+void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                        unsigned ByteAlignment) {}
+
+void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                             unsigned ByteAlignment) {}
+
+void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                    unsigned Size, unsigned ByteAlignment) {}
+
+void PTXMCAsmStreamer::EmitTBSSSymbol(const MCSection *Section,
+                                      MCSymbol *Symbol,
+                                      uint64_t Size, unsigned ByteAlignment) {}
+
+static inline char toOctal(int X) { return '0' + (X & 7); } // low 3 bits -> '0'..'7'
+
+/// Writes Data to OS between double quotes.  '"' and '\\' are preceded by a
+/// backslash, printable characters are copied, \b \f \n \r \t use their
+/// short escapes and any other byte becomes a three-digit octal escape.
+static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
+  OS << '"';
+
+  for (StringRef::iterator It = Data.begin(), End = Data.end();
+       It != End; ++It) {
+    const unsigned char Ch = *It;
+
+    if (Ch == '"' || Ch == '\\')
+      OS << '\\' << (char)Ch;
+    else if (isprint(Ch))
+      OS << (char)Ch;
+    else if (Ch == '\b')
+      OS << "\\b";
+    else if (Ch == '\f')
+      OS << "\\f";
+    else if (Ch == '\n')
+      OS << "\\n";
+    else if (Ch == '\r')
+      OS << "\\r";
+    else if (Ch == '\t')
+      OS << "\\t";
+    else {
+      // Anything else: backslash plus three octal digits.
+      OS << '\\' << toOctal(Ch >> 6) << toOctal(Ch >> 3) << toOctal(Ch);
+    }
+  }
+
+  OS << '"';
+}
+
+void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+  const size_t NumBytes = Data.size();
+  if (NumBytes == 0) return;
+
+  // A lone byte is written as a number with the 8-bit data directive.
+  if (NumBytes == 1) {
+    OS << MAI.getData8bitsDirective(AddrSpace)
+       << (unsigned)(unsigned char)Data[0];
+    EmitEOL();
+    return;
+  }
+
+  // Longer data becomes a quoted string.  Prefer the asciz directive when
+  // the target has one and the data ends with a 0 byte (which is dropped).
+  const char *Asciz = MAI.getAscizDirective();
+  const bool UseAsciz = Asciz && Data.back() == 0;
+  OS << (UseAsciz ? Asciz : MAI.getAsciiDirective()) << ' ';
+  if (UseAsciz)
+    Data = Data.substr(0, NumBytes - 1);
+
+  PrintQuotedString(Data, OS);
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                     unsigned AddrSpace) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+  const char *Directive = 0;
+  if (Size == 1)
+    Directive = MAI.getData8bitsDirective(AddrSpace);
+  else if (Size == 2)
+    Directive = MAI.getData16bitsDirective(AddrSpace);
+  else if (Size == 4)
+    Directive = MAI.getData32bitsDirective(AddrSpace);
+  else if (Size == 8)
+    Directive = MAI.getData64bitsDirective(AddrSpace);
+
+  // No 64-bit directive: fold to a constant and emit two 32-bit halves.
+  if (Size == 8 && !Directive) {
+    int64_t IntValue;
+    if (!Value->EvaluateAsAbsolute(IntValue))
+      report_fatal_error("Don't know how to emit this value.");
+    const uint32_t Lo = (uint32_t)(IntValue >> 0);
+    const uint32_t Hi = (uint32_t)(IntValue >> 32);
+    const bool LE = getContext().getAsmInfo().isLittleEndian();
+    EmitIntValue(LE ? Lo : Hi, 4, AddrSpace);
+    EmitIntValue(LE ? Hi : Lo, 4, AddrSpace);
+    return;
+  }
+
+  assert(Directive && "Invalid size for machine code value!");
+  OS << Directive << *Value;
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
+  assert(MAI.hasLEB128() && "Cannot print a .uleb");
+  OS << ".uleb128 " << *Value;  // ".uleb128 <value>" on its own line
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
+  assert(MAI.hasLEB128() && "Cannot print a .sleb");
+  OS << ".sleb128 " << *Value;  // ".sleb128 <value>" on its own line
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
+  assert(MAI.getGPRel32Directive() != 0);  // the target must provide one
+  OS << MAI.getGPRel32Directive() << *Value;  // "<directive><value>"
+  EmitEOL();
+}
+
+
+/// EmitFill - Write NumBytes copies of FillValue.  In address space 0, if
+/// the target has a zero directive, a single "<directive><count>[,<value>]"
+/// line is printed; otherwise the MCStreamer default is used.
+void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                                unsigned AddrSpace) {
+  if (NumBytes == 0) return;
+
+  const char *Directive = (AddrSpace == 0) ? MAI.getZeroDirective() : 0;
+  if (!Directive) {
+    MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace);
+    return;
+  }
+
+  OS << Directive << NumBytes;
+  if (FillValue)
+    OS << ',' << (int)FillValue;
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+                                            int64_t Value,
+                                            unsigned ValueSize,
+                                            unsigned MaxBytesToEmit) {
+  if (!isPowerOf2_32(ByteAlignment)) {
+    // Non-power of two alignment.  This is not widely supported by
+    // assemblers.
+    // FIXME: Parameterize this based on MAI.
+    switch (ValueSize) {
+    case 1: OS << ".balign";  break;
+    case 2: OS << ".balignw"; break;
+    case 4: OS << ".balignl"; break;
+    case 8: llvm_unreachable("Unsupported alignment size!");
+    default: llvm_unreachable("Invalid size for machine code value!");
+    }
+
+    OS << ' ' << ByteAlignment << ", " << truncateToSize(Value, ValueSize);
+    if (MaxBytesToEmit)
+      OS << ", " << MaxBytesToEmit;
+    EmitEOL();
+    return;
+  }
+
+  // Some assemblers only accept power-of-two alignments, so that form is
+  // used whenever possible.  FIXME: use MAI for the 2/4-byte directives!
+  switch (ValueSize) {
+  case 1: OS << MAI.getAlignDirective(); break;
+  case 2: OS << ".p2alignw "; break;
+  case 4: OS << ".p2alignl "; break;
+  case 8: llvm_unreachable("Unsupported alignment size!");
+  default: llvm_unreachable("Invalid size for machine code value!");
+  }
+
+  // The operand is either the byte count or its log2, as MAI dictates.
+  const bool InBytes = MAI.getAlignmentIsInBytes();
+  OS << (InBytes ? ByteAlignment : Log2_32(ByteAlignment));
+
+  // Fill value (hex) and byte limit are optional, but a limit forces the
+  // fill value to be printed as well.
+  if (Value || MaxBytesToEmit) {
+    OS << ", 0x";
+    OS.write_hex(truncateToSize(Value, ValueSize));
+
+    if (MaxBytesToEmit)
+      OS << ", " << MaxBytesToEmit;
+  }
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+                                         unsigned MaxBytesToEmit) {} // no-op
+
+bool PTXMCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, // emits nothing
+                                         unsigned char Value) {return false;}
+
+
+void PTXMCAsmStreamer::EmitFileDirective(StringRef Filename) {
+  assert(MAI.hasSingleParameterDotFile());
+  OS << "\t.file\t";  // followed by the quoted, escaped file name
+  PrintQuotedString(Filename, OS);
+  EmitEOL();
+}
+
+// FIXME: should we inherit from MCAsmStreamer?
+bool PTXMCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo,
+                                              StringRef Directory,
+                                              StringRef Filename) {
+  if (Directory.empty()) {
+    // Base case: print ".file <no> <quoted name>", then defer to MCStreamer.
+    OS << "\t.file\t" << FileNo << ' ';
+    PrintQuotedString(Filename, OS);
+    EmitEOL();
+    return MCStreamer::EmitDwarfFileDirective(FileNo, Directory, Filename);
+  }
+  if (sys::path::is_absolute(Filename))  // absolute name: directory is ignored
+    return EmitDwarfFileDirective(FileNo, "", Filename);
+  SmallString<128> FullPathName = Directory;
+  sys::path::append(FullPathName, Filename);
+  return EmitDwarfFileDirective(FileNo, "", FullPathName);
+}
+
+void PTXMCAsmStreamer::AddEncodingComment(const MCInst &Inst) {} // no-op
+
+void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+  // With a code emitter present, give the encoding-comment hook a chance.
+  if (Emitter)
+    AddEncodingComment(Inst);
+
+  // Optionally dump the MCInst itself into the comment stream.
+  if (ShowInst) {
+    raw_ostream &Comments = GetCommentOS();
+    Inst.dump_pretty(Comments, &MAI, InstPrinter.get(), "\n ");
+    Comments << "\n";
+  }
+
+  // Use the instruction printer when there is one, else the MCInst printer.
+  if (!InstPrinter)
+    Inst.print(OS, &MAI);
+  else
+    InstPrinter->printInst(&Inst, OS, "");
+  EmitEOL();
+}
+
+/// EmitRawText - Print String as a line of output.  At most one trailing
+/// '\n' is removed from it first, since EmitEOL() supplies the line
+/// ending (hasRawTextSupport() advertises this capability).
+void PTXMCAsmStreamer::EmitRawText(StringRef String) {
+  size_t Len = String.size();
+  if (Len != 0 && String[Len - 1] == '\n') --Len;
+  OS << String.substr(0, Len);
+  EmitEOL();
+}
+
+void PTXMCAsmStreamer::FinishImpl() {} // no-op
+
+namespace llvm {
+  /// Creates a PTXMCAsmStreamer writing to OS.  useCFI, useDwarfDirectory
+  /// and MAB are accepted but not passed on to the streamer.
+  MCStreamer *createPTXAsmStreamer(MCContext &Context,
+                                   formatted_raw_ostream &OS,
+                                   bool isVerboseAsm, bool useLoc, bool useCFI,
+                                   bool useDwarfDirectory, MCInstPrinter *IP,
+                                   MCCodeEmitter *CE, MCAsmBackend *MAB,
+                                   bool ShowInst) {
+    return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
+                                IP, CE, ShowInst);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXMCInstLower.cpp b/contrib/llvm/lib/Target/PTX/PTXMCInstLower.cpp
new file mode 100644
index 0000000..142e639
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXMCInstLower.cpp
@@ -0,0 +1,32 @@
+//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PTX MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXAsmPrinter.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+/// Copy MI's opcode to OutMI and append one AP-lowered MCOperand per operand.
+void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        PTXAsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+  // Operands are lowered one-for-one, in their original order.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    OutMI.addOperand(AP.lowerOperand(MI->getOperand(i)));
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXMFInfoExtract.cpp b/contrib/llvm/lib/Target/PTX/PTXMFInfoExtract.cpp
new file mode 100644
index 0000000..172a0e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXMFInfoExtract.cpp
@@ -0,0 +1,85 @@
+//===-- PTXMFInfoExtract.cpp - Extract PTX machine function info ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an information extractor for PTX machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-mf-info-extract"
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "PTXMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// NOTE: PTXMFInfoExtract must run after register allocation!
+
+namespace {
+  /// PTXMFInfoExtract - Machine function pass that collects PTX-specific
+  /// machine function information for PTXAsmPrinter.
+  ///
+  class PTXMFInfoExtract : public MachineFunctionPass {
+    static char ID;   // pass identifier handed to MachineFunctionPass
+
+  public:
+    /// TM and OptLevel are accepted but not used by the pass.
+    PTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "PTX Machine Function Info Extractor";
+    }
+  }; // class PTXMFInfoExtract
+} // end anonymous namespace
+
+using namespace llvm;
+// Definition of the static pass ID that the constructor passes to the base.
+char PTXMFInfoExtract::ID = 0;
+
+bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
+  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Register every virtual register of the function with MFI, tagged with
+  // the PTX type of its register class and the "Reg" register space.
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
+    unsigned RegType;
+    if (TRC == PTX::RegPredRegisterClass)
+      RegType = PTXRegisterType::Pred;
+    else if (TRC == PTX::RegI16RegisterClass)
+      RegType = PTXRegisterType::B16;
+    else if (TRC == PTX::RegI32RegisterClass)
+      RegType = PTXRegisterType::B32;
+    else if (TRC == PTX::RegI64RegisterClass)
+      RegType = PTXRegisterType::B64;
+    else if (TRC == PTX::RegF32RegisterClass)
+      RegType = PTXRegisterType::F32;
+    else if (TRC == PTX::RegF64RegisterClass)
+      RegType = PTXRegisterType::F64;
+    else
+      llvm_unreachable("Unknown register class.");
+    MFI->addRegister(Reg, RegType, PTXRegisterSpace::Reg);
+  }
+  return false;  // only MFI is updated; the machine code is left untouched
+}
+
+FunctionPass *llvm::createPTXMFInfoExtract(PTXTargetMachine &TM,
+                                           CodeGenOpt::Level OptLevel) {
+  return new PTXMFInfoExtract(TM, OptLevel); // heap-allocated; not freed here
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.cpp
new file mode 100644
index 0000000..60acfc7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- PTXMachineFunctionInfo.cpp - PTX machine function info ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void PTXMachineFunctionInfo::anchor() { } // empty out-of-line definition
diff --git a/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.h
new file mode 100644
index 0000000..bb7574c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXMachineFunctionInfo.h
@@ -0,0 +1,202 @@
+//===-- PTXMachineFunctionInfo.h - PTX machine function info -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares PTX-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_MACHINE_FUNCTION_INFO_H
+#define PTX_MACHINE_FUNCTION_INFO_H
+
+#include "PTX.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// PTXMachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private PTX target-specific information for each MachineFunction.
+///
+class PTXMachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  bool IsKernel;                // set via setKernel(); false by default
+  DenseSet<unsigned> RegArgs;   // registers holding function arguments
+  DenseSet<unsigned> RegRets;   // registers holding return values
+
+  typedef DenseMap<int, std::string> FrameMap;
+
+  FrameMap FrameSymbols;        // FrameIndex -> cached "__local<N>" name
+
+  struct RegisterInfo {
+    unsigned Reg;      // register number used as the map key
+    unsigned Type;     // RegType passed to addRegister()
+    unsigned Space;    // RegSpace passed to addRegister()
+    unsigned Offset;   // count of matching registers added earlier
+    unsigned Encoded;  // (Offset << 6) | (Type << 3) | Space
+  };
+
+  typedef DenseMap<unsigned, RegisterInfo> RegisterInfoMap;
+
+  RegisterInfoMap RegInfo;
+
+  PTXParamManager ParamManager;
+
+public:
+  typedef DenseSet<unsigned>::const_iterator reg_iterator;
+
+  PTXMachineFunctionInfo(MachineFunction &MF)
+    : IsKernel(false) {
+  }
+
+  /// getParamManager - Returns the PTXParamManager instance for this function.
+  PTXParamManager& getParamManager() { return ParamManager; }
+  const PTXParamManager& getParamManager() const { return ParamManager; }
+
+  /// setKernel/isKernel - Gets/sets a flag that indicates if this function is
+  /// a PTX kernel function.
+  void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; }
+  bool isKernel() const { return IsKernel; }
+
+  /// argreg_begin/argreg_end - Returns iterators to the set of registers
+  /// containing function arguments.
+  reg_iterator argreg_begin() const { return RegArgs.begin(); }
+  reg_iterator argreg_end()   const { return RegArgs.end(); }
+
+  /// retreg_begin/retreg_end - Returns iterators to the set of registers
+  /// containing the function return values.
+  reg_iterator retreg_begin() const { return RegRets.begin(); }
+  reg_iterator retreg_end()   const { return RegRets.end(); }
+
+  /// addRegister - Adds a virtual register to the set of all used registers
+  void addRegister(unsigned Reg, unsigned RegType, unsigned RegSpace) {
+    if (!RegInfo.count(Reg)) {
+      RegisterInfo Info;
+      Info.Reg = Reg;
+      Info.Type = RegType;
+      Info.Space = RegSpace;
+
+      // Determine register offset
+      Info.Offset = 0;
+      for(RegisterInfoMap::const_iterator i = RegInfo.begin(),
+          e = RegInfo.end(); i != e; ++i) {
+        const RegisterInfo& RI = i->second;
+        if (RI.Space == RegSpace)
+          if (RI.Space != PTXRegisterSpace::Reg || RI.Type == Info.Type)
+            Info.Offset++;
+      }
+
+      // Encode the register data into a single register number
+      Info.Encoded = (Info.Offset << 6) | (Info.Type << 3) | Info.Space;
+
+      RegInfo[Reg] = Info;
+
+      if (RegSpace == PTXRegisterSpace::Argument)
+        RegArgs.insert(Reg);
+      else if (RegSpace == PTXRegisterSpace::Return)
+        RegRets.insert(Reg);
+    }
+  }
+
+  /// countRegisters - Returns the number of registers of the given type and
+  /// space.
+  unsigned countRegisters(unsigned RegType, unsigned RegSpace) const {
+    unsigned Count = 0;
+    for(RegisterInfoMap::const_iterator i = RegInfo.begin(), e = RegInfo.end();
+        i != e; ++i) {
+      const RegisterInfo& RI = i->second;
+      if (RI.Type == RegType && RI.Space == RegSpace)
+        Count++;
+    }
+    return Count;
+  }
+
+  /// getEncodedRegister - Returns the encoded value of the register.
+  unsigned getEncodedRegister(unsigned Reg) const {
+    return RegInfo.lookup(Reg).Encoded;
+  }
+
+  /// addRetReg - Adds a register to the set of return-value registers.
+  void addRetReg(unsigned Reg) {
+    // DenseSet::insert leaves the set unchanged if Reg is already present,
+    // so no explicit membership test is needed.
+    RegRets.insert(Reg);
+  }
+
+  /// addArgReg - Adds a register to the set of function argument registers.
+  void addArgReg(unsigned Reg) {
+    RegArgs.insert(Reg);
+  }
+
+  /// getRegisterName - Returns the name of the specified virtual register. This
+  /// name is used during PTX emission.
+  std::string getRegisterName(unsigned Reg) const {
+    if (RegInfo.count(Reg)) {
+      const RegisterInfo& RI = RegInfo.lookup(Reg);
+      std::string Name;
+      raw_string_ostream NameStr(Name);
+      decodeRegisterName(NameStr, RI.Encoded);
+      NameStr.flush();
+      return Name;
+    }
+    else if (Reg == PTX::NoRegister)
+      return "%noreg";
+    else
+      llvm_unreachable("Register not in register name map");
+  }
+
+  /// getEncodedRegisterName - Returns the name of the encoded register.
+  std::string getEncodedRegisterName(unsigned EncodedReg) const {
+    std::string Name;
+    raw_string_ostream NameStr(Name);
+    decodeRegisterName(NameStr, EncodedReg);
+    NameStr.flush();
+    return Name;
+  }
+
+  /// getRegisterType - Returns the type of the specified virtual register.
+  unsigned getRegisterType(unsigned Reg) const {
+    if (RegInfo.count(Reg))
+      return RegInfo.lookup(Reg).Type;
+    else
+      llvm_unreachable("Unknown register");
+  }
+
+  /// getOffsetForRegister - Returns the offset of the virtual register
+  unsigned getOffsetForRegister(unsigned Reg) const {
+    if (RegInfo.count(Reg))
+      return RegInfo.lookup(Reg).Offset;
+    else
+      return 0;
+  }
+
+  /// getFrameSymbol - Returns the symbol name for the given FrameIndex.
+  const char* getFrameSymbol(int FrameIndex) {
+    // The whole point of caching this name is to ensure the pointer we pass
+    // to any getExternalSymbol() calls will remain valid for the lifetime of
+    // the back-end instance. This is to work around an issue in SelectionDAG
+    // where symbol names are expected to be life-long strings.
+    //
+    // Always hand out a pointer into the string stored in the map itself;
+    // DenseMap::lookup() returns a temporary copy, whose c_str() would dangle.
+    // NOTE(review): map growth may relocate stored strings - confirm callers.
+    std::string &Name = FrameSymbols[FrameIndex];
+    if (Name.empty())   // first request for this index: build "__local<N>"
+      Name = "__local" + utostr(FrameIndex);
+    return Name.c_str();
+  }
+}; // class PTXMachineFunctionInfo
+} // namespace llvm
+
+#endif // PTX_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/PTX/PTXParamManager.cpp b/contrib/llvm/lib/Target/PTX/PTXParamManager.cpp
new file mode 100644
index 0000000..cc1cc71
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXParamManager.cpp
@@ -0,0 +1,73 @@
+//===-- PTXParamManager.cpp - Manager for .param variables ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXParamManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXParamManager.h"
+#include "PTX.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+// Nothing to set up: every container member starts out empty.
+PTXParamManager::PTXParamManager() {}
+
+// Registers a new argument .param of the given Size and returns its index in
+// AllParams. Names are "__param_<k>", with k counting arguments from 1.
+unsigned PTXParamManager::addArgumentParam(unsigned Size) {
+  // The index is the number of params registered so far, of any kind.
+  const unsigned NewIndex = AllParams.size();
+
+  PTXParam Entry;
+  Entry.Type = PTX_PARAM_TYPE_ARGUMENT;
+  Entry.Size = Size;
+  Entry.Name = "__param_" + utostr(ArgumentParams.size()+1);
+
+  AllParams[NewIndex] = Entry;
+  ArgumentParams.push_back(NewIndex);
+
+  return NewIndex;
+}
+
+// Registers a new return .param of the given Size and returns its index in
+// AllParams. Names are "__ret_<k>", with k counting return params from 1.
+unsigned PTXParamManager::addReturnParam(unsigned Size) {
+  // The index is the number of params registered so far, of any kind.
+  const unsigned NewIndex = AllParams.size();
+
+  PTXParam Entry;
+  Entry.Type = PTX_PARAM_TYPE_RETURN;
+  Entry.Size = Size;
+  Entry.Name = "__ret_" + utostr(ReturnParams.size()+1);
+
+  AllParams[NewIndex] = Entry;
+  ReturnParams.push_back(NewIndex);
+
+  return NewIndex;
+}
+
+// Registers a new local .param of the given Size and returns its index in
+// AllParams. Names are "__localparam_<k>", with k counting locals from 1.
+unsigned PTXParamManager::addLocalParam(unsigned Size) {
+  // The index is the number of params registered so far, of any kind.
+  const unsigned NewIndex = AllParams.size();
+
+  PTXParam Entry;
+  Entry.Type = PTX_PARAM_TYPE_LOCAL;
+  Entry.Size = Size;
+  Entry.Name = "__localparam_" + utostr(LocalParams.size()+1);
+
+  AllParams[NewIndex] = Entry;
+  LocalParams.push_back(NewIndex);
+
+  return NewIndex;
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXParamManager.h b/contrib/llvm/lib/Target/PTX/PTXParamManager.h
new file mode 100644
index 0000000..92e7728
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXParamManager.h
@@ -0,0 +1,87 @@
+//===-- PTXParamManager.h - Manager for .param variables --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTXParamManager class, which manages all defined .param
+// variables for a particular function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_PARAM_MANAGER_H
+#define PTX_PARAM_MANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// PTXParamManager - Bookkeeping for the .param variables that belong to one
+/// function, split into argument, return, and local parameters.
+class PTXParamManager {
+private:
+  /// PTXParamType - The role a .param variable plays
+  enum PTXParamType {
+    PTX_PARAM_TYPE_ARGUMENT,
+    PTX_PARAM_TYPE_RETURN,
+    PTX_PARAM_TYPE_LOCAL
+  };
+
+  /// PTXParam - Record describing a single .param variable
+  struct PTXParam {
+    PTXParamType  Type;
+    unsigned      Size;
+    std::string   Name;
+  };
+
+  /// All parameters by index; the three lists below hold indices into it.
+  DenseMap<unsigned, PTXParam> AllParams;
+  SmallVector<unsigned, 4> ArgumentParams;
+  SmallVector<unsigned, 4> ReturnParams;
+  SmallVector<unsigned, 4> LocalParams;
+
+public:
+  typedef SmallVector<unsigned, 4>::const_iterator param_iterator;
+
+  PTXParamManager();
+
+  param_iterator arg_begin() const { return ArgumentParams.begin(); }
+  param_iterator arg_end() const { return ArgumentParams.end(); }
+  param_iterator ret_begin() const { return ReturnParams.begin(); }
+  param_iterator ret_end() const { return ReturnParams.end(); }
+  param_iterator local_begin() const { return LocalParams.begin(); }
+  param_iterator local_end() const { return LocalParams.end(); }
+
+  /// addArgumentParam - Creates a new argument .param; returns its index.
+  unsigned addArgumentParam(unsigned Size);
+
+  /// addReturnParam - Creates a new return .param; returns its index.
+  unsigned addReturnParam(unsigned Size);
+
+  /// addLocalParam - Creates a new local .param; returns its index.
+  unsigned addLocalParam(unsigned Size);
+
+  /// getParamName - Returns the name of the parameter with index \p Param.
+  const std::string &getParamName(unsigned Param) const {
+    DenseMap<unsigned, PTXParam>::const_iterator Entry = AllParams.find(Param);
+    assert(Entry != AllParams.end() && "Param has not been defined!");
+    return Entry->second.Name;
+  }
+
+  /// getParamSize - Returns the size of the parameter in bits.
+  unsigned getParamSize(unsigned Param) const {
+    DenseMap<unsigned, PTXParam>::const_iterator Entry = AllParams.find(Param);
+    assert(Entry != AllParams.end() && "Param has not been defined!");
+    return Entry->second.Size;
+  }
+};
+
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXRegAlloc.cpp b/contrib/llvm/lib/Target/PTX/PTXRegAlloc.cpp
new file mode 100644
index 0000000..7fd5375
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXRegAlloc.cpp
@@ -0,0 +1,53 @@
+//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a register allocator for PTX code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-reg-alloc"
+
+#include "PTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+
+using namespace llvm;
+
+namespace {
+  // Placeholder register allocator for PTX: the pass leaves every machine
+  // function untouched and reports that nothing was modified.
+  class PTXRegAlloc : public MachineFunctionPass {
+  public:
+    static char ID;
+    PTXRegAlloc() : MachineFunctionPass(ID) {}
+
+    virtual const char *getPassName() const { return "PTX Register Allocator"; }
+
+    // Marks the CFG as preserved, then defers to the base class.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    // We do not actually do anything (at least not yet).
+    virtual bool runOnMachineFunction(MachineFunction & /*MF*/) {
+      return false;
+    }
+  };
+
+  char PTXRegAlloc::ID = 0;
+
+  static RegisterRegAlloc
+    ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator);
+}
+
+// Factory hook (also handed to the "ptx" RegisterRegAlloc entry): returns a
+// freshly heap-allocated PTXRegAlloc pass.
+FunctionPass *llvm::createPTXRegisterAllocator() { return new PTXRegAlloc(); }
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.cpp b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.cpp
new file mode 100644
index 0000000..b6ffd38
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.cpp
@@ -0,0 +1,38 @@
+//===-- PTXRegisterInfo.cpp - PTX Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXRegisterInfo.h"
+#include "PTX.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "PTXGenRegisterInfo.inc"
+
+using namespace llvm;
+
+// TM is currently unused. PTX does not have a return address register, so 0
+// is passed to the generated base class.
+PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine & /*TM*/,
+                                 const TargetInstrInfo &tii)
+  : PTXGenRegisterInfo(0), TII(tii) {}
+
+// Never expected to run for PTX: reaching this function is treated as a bug.
+void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator,
+                                          int, RegScavenger *) const {
+  llvm_unreachable("FrameIndex should have been previously eliminated!");
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.h b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.h
new file mode 100644
index 0000000..5614ce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.h
@@ -0,0 +1,56 @@
+//===-- PTXRegisterInfo.h - PTX Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_REGISTER_INFO_H
+#define PTX_REGISTER_INFO_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/BitVector.h"
+
+#define GET_REGINFO_HEADER
+#include "PTXGenRegisterInfo.inc"
+
+namespace llvm {
+class PTXTargetMachine;
+class MachineFunction;
+
+/// PTXRegisterInfo - PTX register info: nothing saved, nothing reserved.
+struct PTXRegisterInfo : public PTXGenRegisterInfo {
+private:
+  const TargetInstrInfo &TII;
+
+public:
+  PTXRegisterInfo(PTXTargetMachine &TM, const TargetInstrInfo &tii);
+
+  /// Returns a list holding only 0, i.e. no register is callee-saved.
+  virtual const uint16_t
+    *getCalleeSavedRegs(const MachineFunction *MF = 0) const {
+    static const uint16_t NoCalleeSaved[] = { 0 };
+    return NoCalleeSaved;
+  }
+
+  /// Returns getNumRegs() cleared bits, i.e. no register is reserved.
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const {
+    return BitVector(getNumRegs());
+  }
+
+  virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                                   RegScavenger *RS = NULL) const;
+
+  virtual unsigned getFrameRegister(const MachineFunction &MF) const {
+    llvm_unreachable("PTX does not have a frame register");
+  }
+}; // struct PTXRegisterInfo
+} // namespace llvm
+
+#endif // PTX_REGISTER_INFO_H
diff --git a/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.td b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.td
new file mode 100644
index 0000000..e8b262e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXRegisterInfo.td
@@ -0,0 +1,36 @@
+//===-- PTXRegisterInfo.td - PTX Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class PTXReg<string n> : Register<n> {
+  let Namespace = "PTX";
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+// The generated register info code throws warnings for empty register classes
+// (e.g. zero-length arrays), so we use a dummy register here just to prevent
+// these warnings.
+def DUMMY_REG : PTXReg<"R0">;
+
+//===----------------------------------------------------------------------===//
+//  Register classes (each one lists only DUMMY_REG as a member)
+//===----------------------------------------------------------------------===//
+def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>;
+def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>;
+def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>;
+def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>;
+def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>;
+def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>;
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.cpp
new file mode 100644
index 0000000..a116fab
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.cpp
@@ -0,0 +1,150 @@
+//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-selectiondag-info"
+#include "PTXTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+// Keeps a pointer to the PTXSubtarget obtained from TM for later queries.
+PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM),
+    Subtarget(&TM.getSubtarget<PTXSubtarget>()) {}
+
+// Nothing is released here; the Subtarget pointer is left untouched.
+PTXSelectionDAGInfo::~PTXSelectionDAGInfo() {}
+
+SDValue
+PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile, bool AlwaysInline,
+                                             MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
+  // Lowers a memcpy into repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment; otherwise an empty SDValue is returned.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  // Always inline memcpys. In PTX, we do not have a C library that provides
+  // a memcpy function.
+  //if (!AlwaysInline)
+  //  return SDValue();
+
+  unsigned BytesLeft = SizeVal & 3;  // bytes past the last full 4-byte word
+  unsigned NumMemOps = SizeVal >> 2; // number of full 4-byte words to copy
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  const unsigned MAX_LOADS_IN_LDM = 6;
+  SDValue TFOps[MAX_LOADS_IN_LDM];
+  SDValue Loads[MAX_LOADS_IN_LDM];
+  uint64_t SrcOff = 0, DstOff = 0;
+  EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  (The LDM name is inherited from the ARM
+  // implementation this code was copied from.)
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      Loads[i] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, PointerType, Src,
+                                         DAG.getConstant(SrcOff, PointerType)),
+                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+                             false, false, 0);
+      TFOps[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                              DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+                                          DAG.getConstant(DstOff, PointerType)),
+                              DstPtrInfo.getWithOffset(DstOff),
+                              isVolatile, false, 0);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    EmittedNumMemOps += i;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes: i16 first, then i8.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, PointerType, Src,
+                                       DAG.getConstant(SrcOff, PointerType)),
+                           SrcPtrInfo.getWithOffset(SrcOff), false, false,
+                           false, 0); // NOTE(review): isVolatile not forwarded
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+                                        DAG.getConstant(DstOff, PointerType)),
+                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+// Not supported yet: any attempt to lower a memset through this hook is
+// treated as unreachable.
+SDValue PTXSelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, DebugLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  llvm_unreachable("memset lowering not implemented for PTX yet");
+}
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.h b/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.h
new file mode 100644
index 0000000..e0c7167
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXSelectionDAGInfo.h
@@ -0,0 +1,53 @@
+//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTX subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXSELECTIONDAGINFO_H
+#define PTXSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target.
+/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo.
+class PTXSelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const PTXSubtarget *Subtarget;
+
+public:
+  explicit PTXSelectionDAGInfo(const TargetMachine &TM);
+  ~PTXSelectionDAGInfo();
+
+  /// Expands a constant-size, 4-byte-aligned memcpy into loads and stores.
+  virtual SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                          SDValue Chain,
+                                          SDValue Dst, SDValue Src,
+                                          SDValue Size, unsigned Align,
+                                          bool isVolatile, bool AlwaysInline,
+                                          MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const;
+
+  /// Not implemented for PTX yet; parameter names match the definition.
+  virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                          SDValue Chain,
+                                          SDValue Dst, SDValue Src,
+                                          SDValue Size, unsigned Align,
+                                          bool isVolatile,
+                                          MachinePointerInfo DstPtrInfo) const;
+};
+
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PTX/PTXSubtarget.cpp b/contrib/llvm/lib/Target/PTX/PTXSubtarget.cpp
new file mode 100644
index 0000000..454f64e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXSubtarget.cpp
@@ -0,0 +1,68 @@
+//===-- PTXSubtarget.cpp - PTX Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTX specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXSubtarget.h"
+#include "PTX.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "PTXGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void PTXSubtarget::anchor() { }
+
+// Starts from fixed defaults (PTX_COMPUTE_1_0, PTX_VERSION_2_0, no double
+// support, FMA available) and then hands CPU and FS to
+// ParseSubtargetFeatures(), substituting "generic" when CPU is empty.
+PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, bool is64Bit)
+  : PTXGenSubtargetInfo(TT, CPU, FS),
+    PTXTarget(PTX_COMPUTE_1_0),
+    PTXVersion(PTX_VERSION_2_0),
+    SupportsDouble(false), SupportsFMA(true),
+    Is64Bit(is64Bit) {
+  const std::string CPUName = CPU.empty() ? std::string("generic") : CPU;
+  ParseSubtargetFeatures(CPUName, FS);
+}
+
+std::string PTXSubtarget::getTargetString() const {
+  switch (PTXTarget) { // the PTX_LAST_* markers land in the default case
+  case PTX_COMPUTE_1_0: return "compute_10";
+  case PTX_COMPUTE_1_1: return "compute_11";
+  case PTX_COMPUTE_1_2: return "compute_12";
+  case PTX_COMPUTE_1_3: return "compute_13";
+  case PTX_COMPUTE_2_0: return "compute_20";
+  case PTX_SM_1_0: return "sm_10";
+  case PTX_SM_1_1: return "sm_11";
+  case PTX_SM_1_2: return "sm_12";
+  case PTX_SM_1_3: return "sm_13";
+  case PTX_SM_2_0: return "sm_20";
+  case PTX_SM_2_1: return "sm_21";
+  case PTX_SM_2_2: return "sm_22";
+  case PTX_SM_2_3: return "sm_23";
+  default: llvm_unreachable("Unknown PTX target");
+  }
+}
+
+std::string PTXSubtarget::getPTXVersionString() const {
+  switch (PTXVersion) { // dotted version text for each enumerator
+  case PTX_VERSION_2_3: return "2.3";
+  case PTX_VERSION_2_2: return "2.2";
+  case PTX_VERSION_2_1: return "2.1";
+  case PTX_VERSION_2_0: return "2.0";
+  }
+  llvm_unreachable("Invalid PTX version");
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXSubtarget.h b/contrib/llvm/lib/Target/PTX/PTXSubtarget.h
new file mode 100644
index 0000000..ce93fef
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXSubtarget.h
@@ -0,0 +1,131 @@
+//===-- PTXSubtarget.h - Define Subtarget for the PTX -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PTX specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_SUBTARGET_H
+#define PTX_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "PTXGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+  class PTXSubtarget : public PTXGenSubtargetInfo {
+      virtual void anchor();
+    public:
+
+      /**
+       * Targets known to the back-end. The compute_* group comes first, then
+       * the sm_* group; each group ends with a PTX_LAST_* marker.
+       */
+      enum PTXTargetEnum {
+        PTX_COMPUTE_1_0, /**< Compute Compatibility 1.0 */
+        PTX_COMPUTE_1_1, /**< Compute Compatibility 1.1 */
+        PTX_COMPUTE_1_2, /**< Compute Compatibility 1.2 */
+        PTX_COMPUTE_1_3, /**< Compute Compatibility 1.3 */
+        PTX_COMPUTE_2_0, /**< Compute Compatibility 2.0 */
+        PTX_LAST_COMPUTE,
+
+        PTX_SM_1_0, /**< Shader Model 1.0 */
+        PTX_SM_1_1, /**< Shader Model 1.1 */
+        PTX_SM_1_2, /**< Shader Model 1.2 */
+        PTX_SM_1_3, /**< Shader Model 1.3 */
+        PTX_SM_2_0, /**< Shader Model 2.0 */
+        PTX_SM_2_1, /**< Shader Model 2.1 */
+        PTX_SM_2_2, /**< Shader Model 2.2 */
+        PTX_SM_2_3, /**< Shader Model 2.3 */
+        PTX_LAST_SM
+      };
+
+      /**
+       * PTX language versions known to the back-end, oldest first.
+       * PTX 2.0 is the minimum supported version.
+       */
+      enum PTXVersionEnum {
+        PTX_VERSION_2_0,  /**< PTX Version 2.0 */
+        PTX_VERSION_2_1,  /**< PTX Version 2.1 */
+        PTX_VERSION_2_2,  /**< PTX Version 2.2 */
+        PTX_VERSION_2_3   /**< PTX Version 2.3 */
+      };
+
+    private:
+
+      /// Target selected for code generation.
+      PTXTargetEnum PTXTarget;
+
+      /// PTX language version in use.
+      PTXVersionEnum PTXVersion;
+
+      /// True if the native .f64 type is supported on the hardware.
+      bool SupportsDouble;
+
+      /// True if the fused multiply-add (FMA) and multiply-add (MAD)
+      /// instructions are supported.
+      bool SupportsFMA;
+
+      /// True if .u64 is used instead of .u32 for addresses.
+      bool Is64Bit;
+
+      /// Returns true when PTXTarget is an sm_* value >= MinSM or a
+      /// compute_* value >= MinCompute.
+      bool isAtLeast(PTXTargetEnum MinSM, PTXTargetEnum MinCompute) const {
+        return (PTXTarget >= MinSM && PTXTarget < PTX_LAST_SM) ||
+               (PTXTarget >= MinCompute && PTXTarget < PTX_LAST_COMPUTE);
+      }
+
+    public:
+
+      PTXSubtarget(const std::string &TT, const std::string &CPU,
+                   const std::string &FS, bool is64Bit);
+
+      // Target architecture accessors
+      std::string getTargetString() const;
+
+      std::string getPTXVersionString() const;
+
+      bool supportsDouble() const { return SupportsDouble; }
+      bool is64Bit() const { return Is64Bit; }
+      bool supportsFMA() const { return SupportsFMA; }
+
+      // PTX version checks
+      bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; }
+      bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; }
+      bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; }
+
+      bool emitPtrAttribute() const { return PTXVersion >= PTX_VERSION_2_2; }
+
+      // Checks that hold from PTX_SM_1_3 / PTX_COMPUTE_1_3 onwards
+      bool fdivNeedsRoundingMode() const {
+        return isAtLeast(PTX_SM_1_3, PTX_COMPUTE_1_3);
+      }
+
+      bool fmadNeedsRoundingMode() const {
+        return isAtLeast(PTX_SM_1_3, PTX_COMPUTE_1_3);
+      }
+
+      // Checks that hold from PTX_SM_2_0 / PTX_COMPUTE_2_0 onwards
+      bool useParamSpaceForDeviceArgs() const {
+        return isAtLeast(PTX_SM_2_0, PTX_COMPUTE_2_0);
+      }
+
+      bool callsAreHandled() const {
+        return isAtLeast(PTX_SM_2_0, PTX_COMPUTE_2_0);
+      }
+
+      void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  }; // class PTXSubtarget
+} // namespace llvm
+
+#endif // PTX_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/PTX/PTXTargetMachine.cpp b/contrib/llvm/lib/Target/PTX/PTXTargetMachine.cpp
new file mode 100644
index 0000000..97b8de1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXTargetMachine.cpp
@@ -0,0 +1,165 @@
+//===-- PTXTargetMachine.cpp - Define TargetMachine for PTX ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTXTargetMachine.h"
+#include "PTX.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+
+using namespace llvm;
+
+namespace llvm {
+  MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                   bool isVerboseAsm, bool useLoc,
+                                   bool useCFI, bool useDwarfDirectory,
+                                   MCInstPrinter *InstPrint,
+                                   MCCodeEmitter *CE,
+                                   MCAsmBackend *MAB,
+                                   bool ShowInst);
+}
+
+// Registers the target machine and the PTX asm streamer for both PTX targets.
+extern "C" void LLVMInitializePTXTarget() {
+  RegisterTargetMachine<PTX32TargetMachine> RegisterPTX32(ThePTX32Target);
+  TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer);
+
+  RegisterTargetMachine<PTX64TargetMachine> RegisterPTX64(ThePTX64Target);
+  TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer);
+}
+
+namespace { // Data layout strings; the two differ only in the "p:" entry.
+  const char *const DataLayout32 =
+    "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
+  const char *const DataLayout64 =
+    "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
+}
+
+// Shared constructor for the 32- and 64-bit machines; is64Bit picks the data
+// layout string and is forwarded to the subtarget. DataLayout and
+// FrameLowering are filled with dummy data.
+PTXTargetMachine::PTXTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL, bool is64Bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    DataLayout(is64Bit ? DataLayout64 : DataLayout32),
+    Subtarget(TT, CPU, FS, is64Bit),
+    FrameLowering(Subtarget),
+    InstrInfo(*this),
+    TSInfo(*this), TLInfo(*this) {
+}
+
+void PTX32TargetMachine::anchor() { }
+
+// 32-bit flavour: forwards everything to the base with is64Bit = false.
+PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void PTX64TargetMachine::anchor() { }
+
+// 64-bit flavour: forwards everything to the base with is64Bit = true.
+PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace llvm {
+/// PTX Code Generator Pass Configuration Options.
+class PTXPassConfig : public TargetPassConfig {
+public:
+  PTXPassConfig(PTXTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  PTXTargetMachine &getPTXTargetMachine() const { // getTM() with PTX type
+      return getTM<PTXTargetMachine>();
+  }
+
+  bool addInstSelector();
+  FunctionPass *createTargetRegisterAllocator(bool);
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
+  bool addPostRegAlloc();
+  void addMachineLateOptimization();
+  bool addPreEmitPass();
+};
+} // namespace llvm
+
+TargetPassConfig *PTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+  PTXPassConfig *Config = new PTXPassConfig(this, PM);
+  Config->disablePass(PrologEpilogCodeInserterID); // no prolog/epilog pass
+  return Config;
+}
+
+bool PTXPassConfig::addInstSelector() { // adds the createPTXISelDag() pass
+  PM->add(createPTXISelDag(getPTXTargetMachine(), getOptLevel()));
+  return false;
+}
+
+FunctionPass *PTXPassConfig::createTargetRegisterAllocator(bool /*Optimized*/) {
+  return createPTXRegisterAllocator(); // same allocator for both settings
+}
+
+// Make the optimized path bypass optimized register allocation.
+void PTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  addFastRegAlloc(RegAllocPass);
+}
+
+bool PTXPassConfig::addPostRegAlloc() {
+  // PTXMFInfoExtract must run after register allocation; it is currently
+  // added in addPreEmitPass() instead, so nothing is scheduled here.
+  return false;
+}
+
+/// Adds BranchFolder then TailDuplicate, verifying after each one added.
+void PTXPassConfig::addMachineLateOptimization() {
+  if (addPass(BranchFolderPassID) != &NoPassID)
+    printAndVerify("After BranchFolding");
+
+  if (addPass(TailDuplicateID) != &NoPassID)
+    printAndVerify("After TailDuplicate");
+}
+
+bool PTXPassConfig::addPreEmitPass() { // MFInfo extraction, then FP rounding
+  PM->add(createPTXMFInfoExtract(getPTXTargetMachine(), getOptLevel()));
+  PM->add(createPTXFPRoundingModePass(getPTXTargetMachine(), getOptLevel()));
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/PTX/PTXTargetMachine.h b/contrib/llvm/lib/Target/PTX/PTXTargetMachine.h
new file mode 100644
index 0000000..278d155
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/PTXTargetMachine.h
@@ -0,0 +1,104 @@
+//===-- PTXTargetMachine.h - Define TargetMachine for PTX -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_TARGET_MACHINE_H
+#define PTX_TARGET_MACHINE_H
+
+#include "PTXISelLowering.h"
+#include "PTXInstrInfo.h"
+#include "PTXFrameLowering.h"
+#include "PTXSelectionDAGInfo.h"
+#include "PTXSubtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PTXTargetMachine : public LLVMTargetMachine { // base of PTX32/PTX64 TMs
+  private: // members are initialized in the order they are declared below
+    const TargetData    DataLayout;
+    PTXSubtarget        Subtarget; // has to be initialized before FrameLowering
+    PTXFrameLowering    FrameLowering;
+    PTXInstrInfo        InstrInfo;
+    PTXSelectionDAGInfo TSInfo;
+    PTXTargetLowering   TLInfo;
+
+  public:
+    PTXTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL,
+                     bool is64Bit);
+
+    virtual const TargetData *getTargetData() const { return &DataLayout; }
+
+    virtual const TargetFrameLowering *getFrameLowering() const {
+      return &FrameLowering;
+    }
+
+    virtual const PTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+    virtual const TargetRegisterInfo *getRegisterInfo() const {
+      return &InstrInfo.getRegisterInfo(); } // obtained through InstrInfo
+
+    virtual const PTXTargetLowering *getTargetLowering() const {
+      return &TLInfo; }
+
+    virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const {
+      return &TSInfo;
+    }
+
+    virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+    // Emission of machine code through JITCodeEmitter is not supported.
+    virtual bool addPassesToEmitMachineCode(PassManagerBase &,
+                                            JITCodeEmitter &,
+                                            bool = true) {
+      return true; // unconditional; no passes are added
+    }
+
+    // Emission of machine code through MCJIT is not supported.
+    virtual bool addPassesToEmitMC(PassManagerBase &,
+                                   MCContext *&,
+                                   raw_ostream &,
+                                   bool = true) {
+      return true; // unconditional; no passes are added
+    }
+
+    // Pass pipeline configuration (defined out of line).
+    virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+}; // class PTXTargetMachine
+
+
+class PTX32TargetMachine : public PTXTargetMachine {
+  virtual void anchor(); // presumably the out-of-line vtable anchor -- confirm
+public:
+
+  PTX32TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL); // no is64Bit, unlike the base ctor
+}; // class PTX32TargetMachine
+
+class PTX64TargetMachine : public PTXTargetMachine {
+  virtual void anchor(); // presumably the out-of-line vtable anchor -- confirm
+public:
+
+  PTX64TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL); // no is64Bit, unlike the base ctor
+}; // class PTX64TargetMachine
+
+} // namespace llvm
+
+#endif // PTX_TARGET_MACHINE_H
diff --git a/contrib/llvm/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/contrib/llvm/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
new file mode 100644
index 0000000..09a2735
--- /dev/null
+++ b/contrib/llvm/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
@@ -0,0 +1,25 @@
+//===-- PTXTargetInfo.cpp - PTX Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "llvm/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::ThePTX32Target; // registered under the name "ptx32" below
+Target llvm::ThePTX64Target; // registered under the name "ptx64" below
+
+extern "C" void LLVMInitializePTXTargetInfo() {
+  // Triple::ptx32 and Triple::ptx64: see llvm/ADT/Triple.h
+  RegisterTarget<Triple::ptx32> Reg32(ThePTX32Target, "ptx32",
+                                      "PTX (32-bit) [Experimental]");
+  RegisterTarget<Triple::ptx64> Reg64(ThePTX64Target, "ptx64",
+                                      "PTX (64-bit) [Experimental]");
+}