Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
11 files changed, 103 insertions, 38 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 709d753..0a5309b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -264,6 +264,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
       // TODO: CodeSize should account for multiple functions.
+
+      // TODO: Should we count size of debug info?
+      if (MI.isDebugValue())
+        continue;
+
+      // FIXME: This is reporting 0 for many instructions.
       CodeSize += MI.getDesc().Size;
 
       unsigned numOperands = MI.getNumOperands();
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 4a65bfc..57b7a73 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -134,13 +134,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst) {
+static bool canVectorizeInst(Instruction *Inst, User *User) {
   switch (Inst->getOpcode()) {
   case Instruction::Load:
-  case Instruction::Store:
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
     return true;
+  case Instruction::Store: {
+    // Must be the stored pointer operand, not a stored value.
+    StoreInst *SI = cast<StoreInst>(Inst);
+    return SI->getPointerOperand() == User;
+  }
   default:
     return false;
   }
@@ -166,7 +170,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
   for (User *AllocaUser : Alloca->users()) {
     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
     if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
+      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
         return false;
 
       WorkList.push_back(AllocaUser);
@@ -184,7 +188,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     GEPVectorIdx[GEP] = Index;
 
     for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser)))
+      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
         return false;
 
       WorkList.push_back(GEPUser);
@@ -240,7 +244,12 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
   for (User *User : Val->users()) {
     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
       continue;
-    if (isa<CallInst>(User)) {
+    if (CallInst *CI = dyn_cast<CallInst>(User)) {
+      // TODO: We might be able to handle some cases where the callee is a
+      // constantexpr bitcast of a function.
+      if (!CI->getCalledFunction())
+        return false;
+
       WorkList.push_back(User);
       continue;
     }
@@ -250,6 +259,12 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
       return false;
 
+    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
+      // Reject if the stored value is not the pointer operand.
+      if (SI->getPointerOperand() != Val)
+        return false;
+    }
+
     if (!User->getType()->isPointerTy())
       continue;
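The new Store cases in both functions accept a store only when the promoted pointer is the address being written to; a pointer that is itself the stored value escapes into memory, so promotion would be unsafe. A minimal standalone sketch of that test (hypothetical helper name, assuming the LLVM C++ API):

#include "llvm/IR/Instructions.h"

// Hypothetical helper mirroring the check above: Val may only appear as the
// address operand ('store %v, Val'); if it is the value operand
// ('store Val, %p'), the pointer escapes and cannot be promoted.
static bool usesValOnlyAsStoreAddress(const llvm::StoreInst *SI,
                                      const llvm::Value *Val) {
  return SI->getPointerOperand() == Val;
}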
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
index 835a146..ba0490a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -14,8 +14,7 @@
 let Namespace = "AMDGPU" in {
 
 foreach Index = 0-15 in {
-  // Indices are used in a variety of ways here, so don't set a size/offset.
-  def sub#Index : SubRegIndex<-1, -1>;
+  def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
 }
 
 def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 468563c..4434d9b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -71,12 +71,26 @@ void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm,
   }
 }
 
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+    return 2;
+  case FK_Data_4:
+    return 4;
+  case FK_Data_8:
+    return 8;
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
 void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                   unsigned DataSize, uint64_t Value,
                                   bool IsPCRel) const {
 
   switch ((unsigned)Fixup.getKind()) {
-    default: llvm_unreachable("Unknown fixup kind");
     case AMDGPU::fixup_si_sopp_br: {
       uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
       *Dst = (Value - 4) / 4;
@@ -96,6 +110,24 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
       *Dst = Value + 4;
       break;
     }
+    default: {
+      // FIXME: Copied from AArch64
+      unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+      if (!Value)
+        return; // Doesn't change encoding.
+      MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+      // Shift the value into position.
+      Value <<= Info.TargetOffset;
+
+      unsigned Offset = Fixup.getOffset();
+      assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+      // For each byte of the fragment that the fixup touches, mask in the
+      // bits from the fixup value.
+      for (unsigned i = 0; i != NumBytes; ++i)
+        Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+    }
   }
 }
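The default case added to applyFixup() is self-contained enough to lift out; a sketch of the same little-endian OR-in loop, assuming only standard headers:

#include <cassert>
#include <cstdint>

// Sketch of the generic data-fixup path: skip zero values (they don't change
// the encoding), then OR each byte of the already-shifted value into the
// fragment, least-significant byte first.
static void orInFixup(char *Data, unsigned DataSize, unsigned Offset,
                      unsigned NumBytes, uint64_t Value) {
  if (!Value)
    return;
  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
}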
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 099b0b1..c2db9ff 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -157,6 +157,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
 
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -2252,10 +2253,8 @@ MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
                                                   SDValue Ptr) const {
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
-                  0xffffffff; // Size
 
-  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
+  return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
 }
 
 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1891061..cfd2c42 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2778,3 +2778,16 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
 
   return RsrcDataFormat;
 }
+
+uint64_t SIInstrInfo::getScratchRsrcWords23() const {
+  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
+                    AMDGPU::RSRC_TID_ENABLE |
+                    0xffffffff; // Size;
+
+  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
+  // Clear them unless we want a huge stride.
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
+
+  return Rsrc23;
+}
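A note on getScratchRsrcWords23(): words 2-3 of a buffer resource descriptor pack the 32-bit size in the low word and flags in the high word, and on Volcanic Islands the DATA_FORMAT bits are reinterpreted as the stride field once TID_ENABLE is set. A standalone sketch of the masking logic with assumed constant values (the real encodings are the AMDGPU::RSRC_* enums, not these):

#include <cstdint>

// Assumed stand-ins for AMDGPU::RSRC_DATA_FORMAT / AMDGPU::RSRC_TID_ENABLE;
// only the masking logic is meant to be accurate here.
static const uint64_t kRsrcDataFormat = UINT64_C(0xf) << 44;
static const uint64_t kRsrcTidEnable = UINT64_C(1) << 55;

static uint64_t scratchRsrcWords23(bool IsVolcanicIslands) {
  // Word 2 (low 32 bits): buffer size. Word 3 (high 32 bits): flags.
  uint64_t Rsrc23 = kRsrcDataFormat | kRsrcTidEnable | 0xffffffff;
  // With TID_ENABLE set, the DATA_FORMAT bits are read as a stride, so
  // clear them to avoid an unintentionally huge stride.
  if (IsVolcanicIslands)
    Rsrc23 &= ~kRsrcDataFormat;
  return Rsrc23;
}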
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 015ea12..5053786 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -353,7 +353,7 @@ public:
   }
 
   uint64_t getDefaultRsrcDataFormat() const;
-
+  uint64_t getScratchRsrcWords23() const;
 };
 
 namespace AMDGPU {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index f78ffd7..e0eeea9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1548,6 +1548,12 @@ defm V_WRITELANE_B32 : VOP2SI_3VI_m <
 // These instructions only exist on SI and CI
 let SubtargetPredicate = isSICI in {
 
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2InstSI <vop2<0x6>, "v_mac_legacy_f32",
+  VOP_F32_F32_F32
+>;
+} // End isCommutable = 1
+
 defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32",
   VOP_F32_F32_F32, AMDGPUfmin_legacy
 >;
@@ -1562,12 +1568,6 @@ defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>;
 } // End isCommutable = 1
 } // End let SubtargetPredicate = SICI
 
-let isCommutable = 1 in {
-defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32",
-  VOP_F32_F32_F32
->;
-} // End isCommutable = 1
-
 defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32",
   VOP_I32_I32_I32
 >;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index 0a7f684..2cd600d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -135,8 +135,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
       unsigned ScratchRsrcReg =
           RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
 
-      uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
-                      0xffffffff; // Size
+      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
 
       unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
       unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
@@ -152,11 +151,11 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
-              .addImm(Rsrc & 0xffffffff)
+              .addImm(Rsrc23 & 0xffffffff)
              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
-              .addImm(Rsrc >> 32)
+              .addImm(Rsrc23 >> 32)
              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
       // Scratch Offset
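SIPrepareScratchRegs then materializes the 64-bit value with two S_MOV_B32s, one per 32-bit word; the split itself is just:

#include <cstdint>

// Low word feeds Rsrc2 (sub2, the size); high word feeds Rsrc3 (sub3, flags).
static uint32_t rsrcWord2(uint64_t Rsrc23) { return uint32_t(Rsrc23 & 0xffffffff); }
static uint32_t rsrcWord3(uint64_t Rsrc23) { return uint32_t(Rsrc23 >> 32); }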
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 54c4d54..e9e8412 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -26,23 +26,25 @@ using namespace llvm;
 
 SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {}
 
-BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
-  BitVector Reserved(getNumRegs());
-  Reserved.set(AMDGPU::EXEC);
+void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+  MCRegAliasIterator R(Reg, this, true);
 
-  // EXEC_LO and EXEC_HI could be allocated and used as regular register,
-  // but this seems likely to result in bugs, so I'm marking them as reserved.
-  Reserved.set(AMDGPU::EXEC_LO);
-  Reserved.set(AMDGPU::EXEC_HI);
+  for (; R.isValid(); ++R)
+    Reserved.set(*R);
+}
 
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-  Reserved.set(AMDGPU::FLAT_SCR);
-  Reserved.set(AMDGPU::FLAT_SCR_LO);
-  Reserved.set(AMDGPU::FLAT_SCR_HI);
+
+  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
+  // this seems likely to result in bugs, so I'm marking them as reserved.
+  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
+  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
   // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
-  Reserved.set(AMDGPU::VGPR255);
-  Reserved.set(AMDGPU::VGPR254);
+  reserveRegisterTuples(Reserved, AMDGPU::VGPR254);
+  reserveRegisterTuples(Reserved, AMDGPU::VGPR255);
 
   // Tonga and Iceland can only allocate a fixed number of SGPRs due
   // to a hw bug.
@@ -54,10 +56,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
     for (unsigned i = Limit; i < NumSGPRs; ++i) {
       unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true);
-
-      for (; R.isValid(); ++R)
-        Reserved.set(*R);
+      reserveRegisterTuples(Reserved, Reg);
     }
   }
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index bfdb67c..7da6de2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -23,7 +23,10 @@ namespace llvm {
 
 struct SIRegisterInfo : public AMDGPURegisterInfo {
+private:
+  void reserveRegisterTuples(BitVector &, unsigned Reg) const;
+public:
   SIRegisterInfo();
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
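The reserveRegisterTuples() helper works because MCRegAliasIterator visits every register that overlaps the given one. A standalone sketch of the same walk (assuming the LLVM MC API), which is what makes reserving EXEC also cover EXEC_LO and EXEC_HI:

#include "llvm/ADT/BitVector.h"
#include "llvm/MC/MCRegisterInfo.h"

// Mark Reg and every aliasing register (sub-registers, super-registers, and
// Reg itself via IncludeSelf) as reserved, so a tuple like EXEC cannot leak
// its halves EXEC_LO/EXEC_HI back into the allocatable set.
static void reserveTuple(llvm::BitVector &Reserved, unsigned Reg,
                         const llvm::MCRegisterInfo &MRI) {
  for (llvm::MCRegAliasIterator R(Reg, &MRI, /*IncludeSelf=*/true);
       R.isValid(); ++R)
    Reserved.set(*R);
}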