From d686ff024731a6edac15edc457c494395b960c32 Mon Sep 17 00:00:00 2001
From: dim
Date: Sat, 16 Jan 2016 17:17:12 +0000
Subject: Vendor import of llvm release_38 branch r257836:
https://llvm.org/svn/llvm-project/llvm/branches/release_38@257836
---
 CMakeLists.txt | 2 +-
 autoconf/configure.ac | 4 +-
 configure | 20 +++++-----
 docs/ReleaseNotes.rst | 11 ++++++
 include/llvm/CodeGen/MachineFrameInfo.h | 14 +++++++
 include/llvm/LinkAllPasses.h | 14 ++++---
 include/llvm/Target/TargetLowering.h | 2 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +-
 lib/Target/AMDGPU/AMDGPU.h | 2 +-
 lib/Target/X86/X86FrameLowering.cpp | 13 ++++---
 lib/Target/X86/X86ISelLowering.cpp | 2 +-
 test/CodeGen/X86/x86-repmov-copy-eflags.ll | 53 +++++++++++++++++++++++++++
 utils/release/test-release.sh | 6 +++
 13 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 test/CodeGen/X86/x86-repmov-copy-eflags.ll

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d2093f..4dd43e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX svn)
+  set(LLVM_VERSION_SUFFIX "")
 endif()
 
 if (POLICY CMP0048)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 02ab161e..8d0ae00 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -32,12 +32,12 @@ dnl===-----------------------------------------------------------------------===
 dnl Initialize autoconf and define the package name, version number and
 dnl address for reporting bugs.
-AC_INIT([LLVM],[3.8.0svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.8.0],[http://llvm.org/bugs/])
 
 LLVM_VERSION_MAJOR=3
 LLVM_VERSION_MINOR=8
 LLVM_VERSION_PATCH=0
-LLVM_VERSION_SUFFIX=svn
+LLVM_VERSION_SUFFIX=
 
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MAJOR], $LLVM_VERSION_MAJOR, [Major version of the LLVM API])
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MINOR], $LLVM_VERSION_MINOR, [Minor version of the LLVM API])
diff --git a/configure b/configure
index 33438c6..c94fb13 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.8.0svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.8.0.
 #
 # Report bugs to <http://llvm.org/bugs/>.
 #
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='LLVM'
 PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.8.0svn'
-PACKAGE_STRING='LLVM 3.8.0svn'
+PACKAGE_VERSION='3.8.0'
+PACKAGE_STRING='LLVM 3.8.0'
 PACKAGE_BUGREPORT='http://llvm.org/bugs/'
 
 ac_unique_file="lib/IR/Module.cpp"
@@ -1334,7 +1334,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures LLVM 3.8.0svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.8.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1400,7 +1400,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of LLVM 3.8.0svn:";;
+     short | recursive ) echo "Configuration of LLVM 3.8.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1584,7 +1584,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-LLVM configure 3.8.0svn
+LLVM configure 3.8.0
 generated by GNU Autoconf 2.60
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1600,7 +1600,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by LLVM $as_me 3.8.0svn, which was
+It was created by LLVM $as_me 3.8.0, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   $ $0 $@
 
@@ -1957,7 +1957,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 LLVM_VERSION_MAJOR=3
 LLVM_VERSION_MINOR=8
 LLVM_VERSION_PATCH=0
-LLVM_VERSION_SUFFIX=svn
+LLVM_VERSION_SUFFIX=
 
 
 cat >>confdefs.h <<_ACEOF
@@ -18279,7 +18279,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by LLVM $as_me 3.8.0svn, which was
+This file was extended by LLVM $as_me 3.8.0, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -18332,7 +18332,7 @@ Report bugs to <http://llvm.org/bugs/>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-LLVM config.status 3.8.0svn
+LLVM config.status 3.8.0
 configured by $0, generated by GNU Autoconf 2.60,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index b3f7c00..dccb7f4 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -68,6 +68,17 @@ Non-comprehensive list of changes in this release
    Core.h so nothing should change for projects directly including the headers,
    but transitive dependencies may be affected.
 
+* llvm-ar now supports thin archives.
+
+* llvm doesn't produce .data.rel.ro.local or .data.rel sections anymore.
+
+* aliases to available_externally globals are now rejected by the verifier.
+
+* the IR Linker has been split into IRMover that moves bits from one module to
+  another and Linker proper that decides what to link.
+
+* Support for dematerializing has been dropped.
+
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 48e8ca7..e50779a 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -251,6 +251,10 @@ class MachineFrameInfo {
   /// opaque mechanism like inline assembly or Win32 EH.
   bool HasOpaqueSPAdjustment;
 
+  /// True if the function contains operations which will lower down to
+  /// instructions which manipulate the stack pointer.
+  bool HasCopyImplyingStackAdjustment;
+
   /// True if the function contains a call to the llvm.vastart intrinsic.
bool HasVAStart; @@ -288,6 +292,7 @@ public: LocalFrameMaxAlign = 0; UseLocalStackAllocationBlock = false; HasOpaqueSPAdjustment = false; + HasCopyImplyingStackAdjustment = false; HasVAStart = false; HasMustTailInVarArgFunc = false; Save = nullptr; @@ -493,6 +498,15 @@ public: bool hasOpaqueSPAdjustment() const { return HasOpaqueSPAdjustment; } void setHasOpaqueSPAdjustment(bool B) { HasOpaqueSPAdjustment = B; } + /// Returns true if the function contains operations which will lower down to + /// instructions which manipulate the stack pointer. + bool hasCopyImplyingStackAdjustment() const { + return HasCopyImplyingStackAdjustment; + } + void setHasCopyImplyingStackAdjustment(bool B) { + HasCopyImplyingStackAdjustment = B; + } + /// Returns true if the function calls the llvm.va_start intrinsic. bool hasVAStart() const { return HasVAStart; } void setHasVAStart(bool B) { HasVAStart = B; } diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index d695d11..327faac 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -160,9 +160,11 @@ namespace { (void) llvm::createPostOrderFunctionAttrsPass(); (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); - (void) llvm::createPrintModulePass(*(llvm::raw_ostream*)nullptr); - (void) llvm::createPrintFunctionPass(*(llvm::raw_ostream*)nullptr); - (void) llvm::createPrintBasicBlockPass(*(llvm::raw_ostream*)nullptr); + std::string buf; + llvm::raw_string_ostream os(buf); + (void) llvm::createPrintModulePass(os); + (void) llvm::createPrintFunctionPass(os); + (void) llvm::createPrintBasicBlockPass(os); (void) llvm::createModuleDebugInfoPrinterPass(); (void) llvm::createPartialInliningPass(); (void) llvm::createLintPass(); @@ -186,10 +188,10 @@ namespace { (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); - ((llvm::Function*)nullptr)->viewCFGOnly(); + llvm::Function::Create(nullptr, llvm::GlobalValue::ExternalLinkage)->viewCFGOnly(); llvm::RGPassManager RGM; - ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM); - llvm::AliasSetTracker X(*(llvm::AliasAnalysis*)nullptr); + llvm::AliasAnalysis AA; + llvm::AliasSetTracker X(AA); X.add(nullptr, 0, llvm::AAMDNodes()); // for -print-alias-sets (void) llvm::AreStatisticsEnabled(); (void) llvm::sys::RunningOnValgrind(); diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 863b7cd..304da4f 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2270,7 +2270,7 @@ public: } /// Return true if the MachineFunction contains a COPY which would imply - /// HasOpaqueSPAdjustment. + /// HasCopyImplyingStackAdjustment. virtual bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const { return false; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 9f8759d..c075da4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -634,7 +634,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } if (TLI->hasCopyImplyingStackAdjustment(MF)) - MFI->setHasOpaqueSPAdjustment(true); + MFI->setHasCopyImplyingStackAdjustment(true); // Freeze the set of reserved registers now that MachineFrameInfo has been // set up. 
All the information required by getReservedRegs() should be
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5d00e1c..4f718e1 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -20,7 +20,7 @@ class AMDGPUInstrPrinter;
 class AMDGPUSubtarget;
 class AMDGPUTargetMachine;
 class FunctionPass;
-class MachineSchedContext;
+struct MachineSchedContext;
 class MCAsmInfo;
 class raw_ostream;
 class ScheduleDAGInstrs;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 8b5fd27..8632bb8 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -91,7 +91,8 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
           MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
           MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
-          MFI->hasStackMap() || MFI->hasPatchPoint());
+          MFI->hasStackMap() || MFI->hasPatchPoint() ||
+          MFI->hasCopyImplyingStackAdjustment());
 }
 
 static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -943,11 +944,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // push and pop from the stack.
   if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
       !TRI->needsStackRealignment(MF) &&
-      !MFI->hasVarSizedObjects() && // No dynamic alloca.
-      !MFI->adjustsStack() &&       // No calls.
-      !IsWin64CC &&                 // Win64 has no Red Zone
-      !MFI->hasOpaqueSPAdjustment() && // Don't push and pop.
-      !MF.shouldSplitStack()) {     // Regular stack
+      !MFI->hasVarSizedObjects() &&             // No dynamic alloca.
+      !MFI->adjustsStack() &&                   // No calls.
+      !IsWin64CC &&                             // Win64 has no Red Zone
+      !MFI->hasCopyImplyingStackAdjustment() && // Don't push and pop.
+      !MF.shouldSplitStack()) {                 // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1ec93b5..b723059 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17458,7 +17458,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     // We need a frame pointer because this will get lowered to a PUSH/POP
     // sequence.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-    MFI->setHasOpaqueSPAdjustment(true);
+    MFI->setHasCopyImplyingStackAdjustment(true);
 
     // Don't do anything here, we will expand these intrinsics out later
     // during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue(); diff --git a/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/test/CodeGen/X86/x86-repmov-copy-eflags.ll new file mode 100644 index 0000000..ad39888 --- /dev/null +++ b/test/CodeGen/X86/x86-repmov-copy-eflags.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +%struct.T = type { i64, [3 x i32] } + +; Function Attrs: nounwind optsize +define void @f(i8* %p, i8* %q, i32* inalloca nocapture %unused) #0 { +entry: + %g = alloca %struct.T, align 8 + %r = alloca i32, align 8 + store i32 0, i32* %r, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 24, i32 8, i1 false) + br label %while.body + +while.body: ; preds = %while.body, %entry + %load = load i32, i32* %r, align 4 + %dec = add nsw i32 %load, -1 + store i32 %dec, i32* %r, align 4 + call void @g(%struct.T* %g) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 + +declare void @g(%struct.T*) + +; CHECK-LABEL: _f: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: andl $-8, %esp +; CHECK-NOT: movl %esp, %esi +; CHECK: rep;movsl +; CHECK: leal 8(%esp), %esi + +; CHECK: decl (%esp) +; CHECK: seto %al +; CHECK: lahf +; CHECK: movl %eax, %edi +; CHECK: pushl %esi +; CHECK: calll _g +; CHECK: addl $4, %esp +; CHECK: movl %edi, %eax +; CHECK: addb $127, %al +; CHECK: sahf + +attributes #0 = { nounwind optsize } +attributes #1 = { argmemonly nounwind } diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index bb1f786..fb50160 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -159,6 +159,12 @@ while [ $# -gt 0 ]; do shift done +if [ "$use_autoconf" = "no" ]; then + # See llvm.org/PR26146. + echo Skipping test-suite when using CMake. + do_test_suite="no" +fi + # Check required arguments. 
if [ -z "$Release" ]; then echo "error: no release number specified" -- cgit v1.1 From 21029d6a214a88783711894533b519ce0e65cc90 Mon Sep 17 00:00:00 2001 From: dim Date: Fri, 22 Jan 2016 21:16:09 +0000 Subject: Vendor import of llvm release_38 branch r258549: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258549 --- include/llvm/CodeGen/MachineFunction.h | 2 +- include/llvm/CodeGen/SelectionDAGNodes.h | 15 + include/llvm/IR/GlobalValue.h | 4 + include/llvm/Transforms/Utils/Local.h | 19 + include/llvm/Transforms/Utils/SimplifyLibCalls.h | 2 - lib/CodeGen/AsmPrinter/DebugLocEntry.h | 13 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 18 + lib/CodeGen/CodeGenPrepare.cpp | 35 +- lib/CodeGen/MachineFunction.cpp | 2 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 44 +- lib/IR/Globals.cpp | 44 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 14 +- lib/Target/ARM/ARMISelLowering.cpp | 9 +- lib/Target/X86/X86CallingConv.td | 4 +- lib/Target/X86/X86FrameLowering.cpp | 18 +- lib/Target/X86/X86ISelLowering.cpp | 9 +- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 187 +-------- lib/Transforms/Utils/InlineFunction.cpp | 331 ++++++++++++++- lib/Transforms/Utils/Local.cpp | 235 ++++++++++- lib/Transforms/Utils/SimplifyLibCalls.cpp | 55 ++- test/CodeGen/AArch64/cxx-tlscc.ll | 27 ++ test/CodeGen/ARM/cse-flags.ll | 43 ++ test/CodeGen/ARM/cxx-tlscc.ll | 11 + test/CodeGen/ARM/memfunc.ll | 18 +- test/CodeGen/X86/2014-05-30-CombineAddNSW.ll | 20 - test/CodeGen/X86/cxx_tlscc64.ll | 27 ++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll | 83 +++- test/DebugInfo/ARM/PR26163.ll | 107 +++++ .../ExecutionEngine/MCJIT/remote/cross-module-a.ll | 2 +- .../ExecutionEngine/MCJIT/remote/multi-module-a.ll | 2 +- .../MCJIT/remote/simpletest-remote.ll | 2 +- test/ExecutionEngine/MCJIT/remote/stubs-remote.ll | 2 +- .../MCJIT/remote/test-common-symbols-remote.ll | 2 +- .../MCJIT/remote/test-data-align-remote.ll | 2 +- .../remote/test-fp-no-external-funcs-remote.ll | 2 +- .../remote/test-global-init-nonzero-remote.ll | 2 +- .../remote/test-global-init-nonzero-sm-pic.ll | 2 +- .../MCJIT/remote/test-ptr-reloc-remote.ll | 2 +- .../MCJIT/remote/test-ptr-reloc-sm-pic.ll | 2 +- .../OrcMCJIT/remote/cross-module-a.ll | 2 +- .../OrcMCJIT/remote/multi-module-a.ll | 2 +- .../OrcMCJIT/remote/simpletest-remote.ll | 2 +- .../OrcMCJIT/remote/stubs-remote.ll | 2 +- .../OrcMCJIT/remote/test-common-symbols-remote.ll | 2 +- .../OrcMCJIT/remote/test-data-align-remote.ll | 2 +- .../remote/test-fp-no-external-funcs-remote.ll | 2 +- .../remote/test-global-init-nonzero-remote.ll | 2 +- .../remote/test-global-init-nonzero-sm-pic.ll | 2 +- .../OrcMCJIT/remote/test-ptr-reloc-remote.ll | 2 +- .../OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll | 2 +- test/MC/AArch64/inst-directive.s | 15 +- .../CodeGenPrepare/ARM/bitreverse-recognize.ll | 37 ++ test/Transforms/CodeGenPrepare/ARM/lit.local.cfg | 3 + test/Transforms/CodeGenPrepare/bitreverse-hang.ll | 53 +++ test/Transforms/Inline/inline-funclets.ll | 455 +++++++++++++++++++++ test/Transforms/InstCombine/bitreverse-hang.ll | 53 +++ .../Transforms/InstCombine/bitreverse-recognize.ll | 114 ------ test/Transforms/InstCombine/cos-2.ll | 16 +- .../InstCombine/double-float-shrink-1.ll | 20 + tools/lli/lli.cpp | 5 +- utils/release/test-release.sh | 21 +- 62 files changed, 1772 insertions(+), 469 deletions(-) create mode 100644 test/CodeGen/ARM/cse-flags.ll delete mode 100644 test/CodeGen/X86/2014-05-30-CombineAddNSW.ll create mode 100644 
test/DebugInfo/ARM/PR26163.ll create mode 100644 test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll create mode 100644 test/Transforms/CodeGenPrepare/ARM/lit.local.cfg create mode 100644 test/Transforms/CodeGenPrepare/bitreverse-hang.ll create mode 100644 test/Transforms/Inline/inline-funclets.ll create mode 100644 test/Transforms/InstCombine/bitreverse-hang.ll delete mode 100644 test/Transforms/InstCombine/bitreverse-recognize.ll diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 82c30d3..df7c951 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -295,7 +295,7 @@ public: } /// Should we be emitting segmented stack stuff for the function - bool shouldSplitStack(); + bool shouldSplitStack() const; /// getNumBlockIDs - Return the number of MBB ID's allocated. /// diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 23816bd..536fc65 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -369,6 +369,18 @@ public: (UnsafeAlgebra << 3) | (NoNaNs << 4) | (NoInfs << 5) | (NoSignedZeros << 6) | (AllowReciprocal << 7); } + + /// Clear any flags in this flag set that aren't also set in Flags. + void intersectWith(const SDNodeFlags *Flags) { + NoUnsignedWrap &= Flags->NoUnsignedWrap; + NoSignedWrap &= Flags->NoSignedWrap; + Exact &= Flags->Exact; + UnsafeAlgebra &= Flags->UnsafeAlgebra; + NoNaNs &= Flags->NoNaNs; + NoInfs &= Flags->NoInfs; + NoSignedZeros &= Flags->NoSignedZeros; + AllowReciprocal &= Flags->AllowReciprocal; + } }; /// Represents one node in the SelectionDAG. @@ -682,6 +694,9 @@ public: /// and directly, but it is not to avoid creating a vtable for this class. const SDNodeFlags *getFlags() const; + /// Clear any flags in this node that aren't also set in Flags. + void intersectFlagsWith(const SDNodeFlags *Flags); + /// Return the number of values defined/returned by this operator. unsigned getNumValues() const { return NumValues; } diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 4fa4e7d..fa6469a 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -346,6 +346,10 @@ public: return !(isDeclarationForLinker() || isWeakForLinker()); } + // Returns true if the alignment of the value can be unilaterally + // increased. + bool canIncreaseAlignment() const; + /// This method unlinks 'this' from the containing module, but does not delete /// it. virtual void removeFromParent() = 0; diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 911c6f1..3ae0165 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -331,6 +331,25 @@ unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, /// during lowering by the GC infrastructure. bool callsGCLeafFunction(ImmutableCallSite CS); +//===----------------------------------------------------------------------===// +// Intrinsic pattern matching +// + +/// Try and match a bitreverse or bswap idiom. +/// +/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added +/// instructions are returned in \c InsertedInsts. They will all have been added +/// to a basic block. +/// +/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where +/// BW is the bitwidth of the integer type). 
A bswap idiom requires anywhere up
+/// to BW / 4 nodes to be searched, so is significantly faster.
+///
+/// This function returns true on a successful match or false otherwise.
+bool recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 410a075..fc34f49 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -125,8 +125,6 @@ private:
   Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
 
   // Math Library Optimizations
-  Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType);
-  Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B);
   Value *optimizeCos(CallInst *CI, IRBuilder<> &B);
   Value *optimizePow(CallInst *CI, IRBuilder<> &B);
   Value *optimizeExp2(CallInst *CI, IRBuilder<> &B);
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index bbe5324..b60ab91 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -93,18 +93,7 @@ public:
   /// variable, merge them by appending Next's values to the current
   /// list of values.
   /// Return true if the merge was successful.
-  bool MergeValues(const DebugLocEntry &Next) {
-    if (Begin == Next.Begin) {
-      auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
-      auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
-      if (Expr->isBitPiece() && NextExpr->isBitPiece()) {
-        addValues(Next.Values);
-        End = Next.End;
-        return true;
-      }
-    }
-    return false;
-  }
+  bool MergeValues(const DebugLocEntry &Next);
 
   /// \brief Attempt to merge this DebugLocEntry with Next and return
   /// true if the merge was successful. Entries can be merged if they
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index a4fb07e..ae62b6b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -805,6 +805,24 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
   return (l1 < r2) && (l2 < r1);
 }
 
+/// \brief If this and Next are describing different pieces of the same
+/// variable, merge them by appending Next's values to the current
+/// list of values.
+/// Return true if the merge was successful.
+bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
+  if (Begin == Next.Begin) {
+    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
+    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
+    if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
+        !piecesOverlap(Expr, NextExpr)) {
+      addValues(Next.Values);
+      End = Next.End;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// Build the location list for all DBG_VALUEs in the function that
 /// describe the same variable. If the ranges of several independent
 /// pieces of the same variable overlap partially, split them up and
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 03e5778..c8007a5 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1742,8 +1742,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
         // over-aligning global variables that have an explicit section is
         // forbidden.
         GlobalVariable *GV;
-        if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
-            !GV->hasSection() && GV->getAlignment() < PrefAlign &&
+        if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
+            GV->getAlignment() < PrefAlign &&
             DL->getTypeAllocSize(GV->getType()->getElementType()) >=
                 MinSize + Offset2)
           GV->setAlignment(PrefAlign);
@@ -5211,6 +5211,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   return false;
 }
 
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+                           const TargetLowering &TLI) {
+  if (!I.getType()->isIntegerTy() ||
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+                                    TLI.getValueType(DL, I.getType(), true)))
+    return false;
+
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+    return false;
+  Instruction *LastInst = Insts.back();
+  I.replaceAllUsesWith(LastInst);
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  return true;
+}
+
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
@@ -5224,8 +5242,19 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
       if (ModifiedDT) return true;
     }
 
-  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+  bool MadeBitReverse = true;
+  while (TLI && MadeBitReverse) {
+    MadeBitReverse = false;
+    for (auto &I : reverse(BB)) {
+      if (makeBitReverse(I, *DL, *TLI)) {
+        MadeBitReverse = MadeChange = true;
+        break;
+      }
+    }
+  }
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+
   return MadeChange;
 }
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index ca4bb1c..f6604f3 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -163,7 +163,7 @@ getOrCreateJumpTableInfo(unsigned EntryKind) {
 }
 
 /// Should we be emitting segmented stack stuff for the function
-bool MachineFunction::shouldSplitStack() {
+bool MachineFunction::shouldSplitStack() const {
   return getFunction()->hasFnAttribute("split-stack");
 }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 96bf914..893871f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -377,22 +377,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
   }
 }
 
-/// Add logical or fast math flag values to FoldingSetNodeID value.
-static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode,
-                           const SDNodeFlags *Flags) {
-  if (!isBinOpWithFlags(Opcode))
-    return;
-
-  unsigned RawFlags = 0;
-  if (Flags)
-    RawFlags = Flags->getRawFlags();
-  ID.AddInteger(RawFlags);
-}
-
-static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) {
-  AddNodeIDFlags(ID, N->getOpcode(), N->getFlags());
-}
-
 static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
                           SDVTList VTList, ArrayRef<SDValue> OpList) {
   AddNodeIDOpcode(ID, OpC);
@@ -528,8 +512,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
   }
   } // end switch (N->getOpcode())
 
-  AddNodeIDFlags(ID, N);
-
   // Target specific memory nodes could also have address spaces to check.
   if (N->isTargetMemoryOpcode())
     ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
@@ -851,6 +833,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -869,6 +854,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -886,6 +874,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -3892,10 +3883,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
     SDValue Ops[] = {N1, N2};
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops);
-    AddNodeIDFlags(ID, Opcode, Flags);
     void *IP = nullptr;
-    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) {
+      if (Flags)
+        E->intersectFlagsWith(Flags);
       return SDValue(E, 0);
+    }
 
     N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
 
@@ -6249,10 +6242,12 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
   if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops);
-    AddNodeIDFlags(ID, Opcode, Flags);
     void *IP = nullptr;
-    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP))
+    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) {
+      if (Flags)
+        E->intersectFlagsWith(Flags);
       return E;
+    }
   }
   return nullptr;
 }
@@ -6948,6 +6943,11 @@ const SDNodeFlags *SDNode::getFlags() const {
   return nullptr;
 }
 
+void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) {
+  if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
+    FlagsNode->Flags.intersectWith(Flags);
+}
+
 SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
   assert(N->getNumValues() == 1 &&
          "Can't unroll a vector with multiple results!");
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6159f93..a61b62b 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -12,11 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
@@ -134,6 +135,47 @@ bool GlobalValue::isDeclaration() const {
   return false;
 }
 
+bool GlobalValue::canIncreaseAlignment() const {
+  // Firstly, can only increase the alignment of a global if it
+  // is a strong definition.
+  if (!isStrongDefinitionForLinker())
+    return false;
+
+  // It also has to either not have a section defined, or, not have
+  // alignment specified.
(If it is assigned a section, the global + // could be densely packed with other objects in the section, and + // increasing the alignment could cause padding issues.) + if (hasSection() && getAlignment() > 0) + return false; + + // On ELF platforms, we're further restricted in that we can't + // increase the alignment of any variable which might be emitted + // into a shared library, and which is exported. If the main + // executable accesses a variable found in a shared-lib, the main + // exe actually allocates memory for and exports the symbol ITSELF, + // overriding the symbol found in the library. That is, at link + // time, the observed alignment of the variable is copied into the + // executable binary. (A COPY relocation is also generated, to copy + // the initial data from the shadowed variable in the shared-lib + // into the location in the main binary, before running code.) + // + // And thus, even though you might think you are defining the + // global, and allocating the memory for the global in your object + // file, and thus should be able to set the alignment arbitrarily, + // that's not actually true. Doing so can cause an ABI breakage; an + // executable might have already been built with the previous + // alignment of the variable, and then assuming an increased + // alignment will be incorrect. + + // Conservatively assume ELF if there's no parent pointer. + bool isELF = + (!Parent || Triple(Parent->getTargetTriple()).isOSBinFormatELF()); + if (isELF && hasDefaultVisibility() && !hasLocalLinkage()) + return false; + + return true; +} + //===----------------------------------------------------------------------===// // GlobalVariable Implementation //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ecfbe9..9b73c5e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10133,6 +10133,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) @@ -10152,13 +10153,13 @@ void AArch64TargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index d26604f..685907a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -112,9 +112,21 @@ public: MCELFStreamer::EmitInstruction(Inst, STI); } + /// Emit a 32-bit value as an instruction. This is only used for the .inst + /// directive, EmitInstruction should be used in other cases. 
void emitInst(uint32_t Inst) { + char Buffer[4]; + + // We can't just use EmitIntValue here, as that will emit a data mapping + // symbol, and swap the endianness on big-endian systems (instructions are + // always little-endian). + for (unsigned I = 0; I < 4; ++I) { + Buffer[I] = uint8_t(Inst); + Inst >>= 8; + } + EmitA64MappingSymbol(); - MCELFStreamer::EmitIntValue(Inst, 4); + MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); } /// This is one of the functions used to emit data into an ELF section, so the diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 37c0795..978e99c 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12423,6 +12423,7 @@ void ARMTargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (ARM::GPRRegClass.contains(*I)) @@ -12442,13 +12443,13 @@ void ARMTargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index e8b96e7..ed2e880 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -832,10 +832,10 @@ def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; // CSRs that are handled by prologue, epilogue. -def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>; // CSRs that are handled explicitly via copies. -def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>; // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8632bb8..7f8ce47 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -2031,6 +2031,10 @@ void X86FrameLowering::adjustForSegmentedStacks( unsigned TlsReg, TlsOffset; DebugLoc DL; + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. + assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); @@ -2271,6 +2275,11 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); DebugLoc DL; + + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. 
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + // HiPE-specific values const unsigned HipeLeafWords = 24; const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; @@ -2584,7 +2593,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. - return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); + return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && + // The lowering of segmented stack and HiPE only support entry blocks + // as prologue blocks: PR26107. + // This limitation may be lifted if we fix: + // - adjustForSegmentedStacks + // - adjustForHiPEPrologue + MF.getFunction()->getCallingConv() != CallingConv::HiPE && + !MF.shouldSplitStack(); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b723059..6904714 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -28908,6 +28908,7 @@ void X86TargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) @@ -28925,13 +28926,13 @@ void X86TargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 95c50d3..76cefd9 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -1565,190 +1566,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return Changed ? &I : nullptr; } - -/// Analyze the specified subexpression and see if it is capable of providing -/// pieces of a bswap or bitreverse. The subexpression provides a potential -/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in -/// the output of the expression came from a corresponding bit in some other -/// value. This function is recursive, and the end result is a mapping of -/// (value, bitnumber) to bitnumber. It is the caller's responsibility to -/// validate that all `value`s are identical and that the bitnumber to bitnumber -/// mapping is correct for a bswap or bitreverse. 
-///
-/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
-/// that the expression deposits the low byte of %X into the high byte of the
-/// result and that all other bits are zero. This expression is accepted,
-/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].
-///
-/// This function returns true if the match was unsuccessful and false if it succeeded.
-/// On entry to the function the "OverallLeftShift" is a signed integer value
-/// indicating the number of bits that the subexpression is later shifted. For
-/// example, if the expression is later right shifted by 16 bits, the
-/// OverallLeftShift value would be -16 on entry. This is used to specify which
-/// bits of BitValues are actually being set.
-///
-/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding
-/// bit is masked to zero by a user. For example, in (X & 255), X will be
-/// processed with a bytemask of 255. BitMask is always in the local
-/// (OverallLeftShift) coordinate space.
-///
-static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask,
-                            SmallVectorImpl<Value *> &BitValues,
-                            SmallVectorImpl<int> &BitProvenance) {
-  if (Instruction *I = dyn_cast<Instruction>(V)) {
-    // If this is an or instruction, it may be an inner node of the bswap.
-    if (I->getOpcode() == Instruction::Or)
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance) ||
-             CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-
-    // If this is a logical shift by a constant, recurse with OverallLeftShift
-    // and BitMask adjusted.
-    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
-      unsigned ShAmt =
-          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
-      // Ensure the shift amount is defined.
-      if (ShAmt > BitValues.size())
-        return true;
-
-      unsigned BitShift = ShAmt;
-      if (I->getOpcode() == Instruction::Shl) {
-        // X << C -> collect(X, +C)
-        OverallLeftShift += BitShift;
-        BitMask = BitMask.lshr(BitShift);
-      } else {
-        // X >>u C -> collect(X, -C)
-        OverallLeftShift -= BitShift;
-        BitMask = BitMask.shl(BitShift);
-      }
-
-      if (OverallLeftShift >= (int)BitValues.size())
-        return true;
-      if (OverallLeftShift <= -(int)BitValues.size())
-        return true;
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-
-    // If this is a logical 'and' with a mask that clears bits, clear the
-    // corresponding bits in BitMask.
-    if (I->getOpcode() == Instruction::And &&
-        isa<ConstantInt>(I->getOperand(1))) {
-      unsigned NumBits = BitValues.size();
-      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
-      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
-
-      for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) {
-        // If this bit is masked out by a later operation, we don't care what
-        // the and mask is.
-        if (BitMask[i] == 0)
-          continue;
-
-        // If the AndMask is zero for this bit, clear the bit.
-        APInt MaskB = AndMask & Bit;
-        if (MaskB == 0) {
-          BitMask.clearBit(i);
-          continue;
-        }
-
-        // Otherwise, this bit is kept.
-      }
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-  }
-
-  // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
-  // the input value to the bswap/bitreverse. To be part of a bswap or
-  // bitreverse we must be demanding a contiguous range of bits from it.
-  unsigned InputBitLen = BitMask.countPopulation();
-  unsigned InputBitNo = BitMask.countTrailingZeros();
-  if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo !=
-      InputBitLen)
-    // Not a contiguous set range of bits!
-    return true;
-
-  // We know we're moving a contiguous range of bits from the input to the
-  // output. Record which bits in the output came from which bits in the input.
-  unsigned DestBitNo = InputBitNo + OverallLeftShift;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitProvenance[DestBitNo + I] = InputBitNo + I;
-
-  // If the destination bit value is already defined, the values are or'd
-  // together, which isn't a bswap/bitreverse (unless it's an or of the same
-  // bits).
-  if (BitValues[DestBitNo] && BitValues[DestBitNo] != V)
-    return true;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitValues[DestBitNo + I] = V;
-
-  return false;
-}
-
-static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
-                                          unsigned BitWidth) {
-  if (From % 8 != To % 8)
-    return false;
-  // Convert from bit indices to byte indices and check for a byte reversal.
-  From >>= 3;
-  To >>= 3;
-  BitWidth >>= 3;
-  return From == BitWidth - To - 1;
-}
-
-static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
-                                               unsigned BitWidth) {
-  return From == BitWidth - To - 1;
-}
-
 /// Given an OR instruction, check to see if this is a bswap or bitreverse
 /// idiom. If so, insert the new intrinsic and return it.
 Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
-  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
-  if (!ITy)
-    return nullptr;   // Can't do vectors.
-  unsigned BW = ITy->getBitWidth();
-
-  /// We keep track of which bit (BitProvenance) inside which value (BitValues)
-  /// defines each bit in the result.
-  SmallVector<Value *, 8> BitValues(BW, nullptr);
-  SmallVector<int, 8> BitProvenance(BW, -1);
-
-  // Try to find all the pieces corresponding to the bswap.
-  APInt BitMask = APInt::getAllOnesValue(BitValues.size());
-  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))
-    return nullptr;
-
-  // Check to see if all of the bits come from the same value.
-  Value *V = BitValues[0];
-  if (!V) return nullptr;  // Didn't find a bit?  Must be zero.
-
-  if (!std::all_of(BitValues.begin(), BitValues.end(),
-                   [&](const Value *X) { return X == V; }))
-    return nullptr;
-
-  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
-  // only byteswap values with an even number of bytes.
-  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
-  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) {
-    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
-    OKForBitReverse &=
-        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
-  }
-
-  Intrinsic::ID Intrin;
-  if (OKForBSwap)
-    Intrin = Intrinsic::bswap;
-  else if (OKForBitReverse)
-    Intrin = Intrinsic::bitreverse;
-  else
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))
     return nullptr;
+  Instruction *LastInst = Insts.pop_back_val();
+  LastInst->removeFromParent();
 
-  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);
-  return CallInst::Create(F, V);
+  for (auto *Inst : Insts)
+    Worklist.Add(Inst);
+  return LastInst;
 }
 
 /// We have an expression of the form (A&C)|(B&D).
Check if A is (cond?-1:0) diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 1457411..79282a2 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -179,13 +179,244 @@ void LandingPadInliningInfo::forwardResume( RI->eraseFromParent(); } +/// Helper for getUnwindDestToken/getUnwindDestTokenHelper. +static Value *getParentPad(Value *EHPad) { + if (auto *FPI = dyn_cast(EHPad)) + return FPI->getParentPad(); + return cast(EHPad)->getParentPad(); +} + +typedef DenseMap UnwindDestMemoTy; + +/// Helper for getUnwindDestToken that does the descendant-ward part of +/// the search. +static Value *getUnwindDestTokenHelper(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + SmallVector Worklist(1, EHPad); + + while (!Worklist.empty()) { + Instruction *CurrentPad = Worklist.pop_back_val(); + // We only put pads on the worklist that aren't in the MemoMap. When + // we find an unwind dest for a pad we may update its ancestors, but + // the queue only ever contains uncles/great-uncles/etc. of CurrentPad, + // so they should never get updated while queued on the worklist. + assert(!MemoMap.count(CurrentPad)); + Value *UnwindDestToken = nullptr; + if (auto *CatchSwitch = dyn_cast(CurrentPad)) { + if (CatchSwitch->hasUnwindDest()) { + UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); + } else { + // Catchswitch doesn't have a 'nounwind' variant, and one might be + // annotated as "unwinds to caller" when really it's nounwind (see + // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the + // parent's unwind dest from this. We can check its catchpads' + // descendants, since they might include a cleanuppad with an + // "unwinds to caller" cleanupret, which can be trusted. + for (auto HI = CatchSwitch->handler_begin(), + HE = CatchSwitch->handler_end(); + HI != HE && !UnwindDestToken; ++HI) { + BasicBlock *HandlerBlock = *HI; + auto *CatchPad = cast(HandlerBlock->getFirstNonPHI()); + for (User *Child : CatchPad->users()) { + // Intentionally ignore invokes here -- since the catchswitch is + // marked "unwind to caller", it would be a verifier error if it + // contained an invoke which unwinds out of it, so any invoke we'd + // encounter must unwind to some child of the catch. + if (!isa(Child) && !isa(Child)) + continue; + + Instruction *ChildPad = cast(Child); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't figure out this child pad yet; queue it. + Worklist.push_back(ChildPad); + continue; + } + // We've already checked this child, but might have found that + // it offers no proof either way. + Value *ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + // We already know the child's unwind dest, which can either + // be ConstantTokenNone to indicate unwind to caller, or can + // be another child of the catchpad. Only the former indicates + // the unwind dest of the catchswitch. 
+ if (isa(ChildUnwindDestToken)) { + UnwindDestToken = ChildUnwindDestToken; + break; + } + assert(getParentPad(ChildUnwindDestToken) == CatchPad); + } + } + } + } else { + auto *CleanupPad = cast(CurrentPad); + for (User *U : CleanupPad->users()) { + if (auto *CleanupRet = dyn_cast(U)) { + if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) + UnwindDestToken = RetUnwindDest->getFirstNonPHI(); + else + UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); + break; + } + Value *ChildUnwindDestToken; + if (auto *Invoke = dyn_cast(U)) { + ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); + } else if (isa(U) || isa(U)) { + Instruction *ChildPad = cast(U); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't resolved this child yet; queue it and keep searching. + Worklist.push_back(ChildPad); + continue; + } + // We've checked this child, but still need to ignore it if it + // had no proof either way. + ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + } else { + // Not a relevant user of the cleanuppad + continue; + } + // In a well-formed program, the child/invoke must either unwind to + // an(other) child of the cleanup, or exit the cleanup. In the + // first case, continue searching. + if (isa(ChildUnwindDestToken) && + getParentPad(ChildUnwindDestToken) == CleanupPad) + continue; + UnwindDestToken = ChildUnwindDestToken; + break; + } + } + // If we haven't found an unwind dest for CurrentPad, we may have queued its + // children, so move on to the next in the worklist. + if (!UnwindDestToken) + continue; + + // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits + // any ancestors of CurrentPad up to but not including UnwindDestToken's + // parent pad. Record this in the memo map, and check to see if the + // original EHPad being queried is one of the ones exited. + Value *UnwindParent; + if (auto *UnwindPad = dyn_cast(UnwindDestToken)) + UnwindParent = getParentPad(UnwindPad); + else + UnwindParent = nullptr; + bool ExitedOriginalPad = false; + for (Instruction *ExitedPad = CurrentPad; + ExitedPad && ExitedPad != UnwindParent; + ExitedPad = dyn_cast(getParentPad(ExitedPad))) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(ExitedPad)) + continue; + MemoMap[ExitedPad] = UnwindDestToken; + ExitedOriginalPad |= (ExitedPad == EHPad); + } + + if (ExitedOriginalPad) + return UnwindDestToken; + + // Continue the search. + } + + // No definitive information is contained within this funclet. + return nullptr; +} + +/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad, +/// return that pad instruction. If it unwinds to caller, return +/// ConstantTokenNone. If it does not have a definitive unwind destination, +/// return nullptr. +/// +/// This routine gets invoked for calls in funclets in inlinees when inlining +/// an invoke. Since many funclets don't have calls inside them, it's queried +/// on-demand rather than building a map of pads to unwind dests up front. +/// Determining a funclet's unwind dest may require recursively searching its +/// descendants, and also ancestors and cousins if the descendants don't provide +/// an answer. Since most funclets will have their unwind dest immediately +/// available as the unwind dest of a catchswitch or cleanupret, this routine +/// searches top-down from the given pad and then up. 
To avoid worst-case +/// quadratic run-time given that approach, it uses a memo map to avoid +/// re-processing funclet trees. The callers that rewrite the IR as they go +/// take advantage of this, for correctness, by checking/forcing rewritten +/// pads' entries to match the original callee view. +static Value *getUnwindDestToken(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + // Catchpads unwind to the same place as their catchswitch; + // redirct any queries on catchpads so the code below can + // deal with just catchswitches and cleanuppads. + if (auto *CPI = dyn_cast(EHPad)) + EHPad = CPI->getCatchSwitch(); + + // Check if we've already determined the unwind dest for this pad. + auto Memo = MemoMap.find(EHPad); + if (Memo != MemoMap.end()) + return Memo->second; + + // Search EHPad and, if necessary, its descendants. + Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap); + assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0)); + if (UnwindDestToken) + return UnwindDestToken; + + // No information is available for this EHPad from itself or any of its + // descendants. An unwind all the way out to a pad in the caller would + // need also to agree with the unwind dest of the parent funclet, so + // search up the chain to try to find a funclet with information. Put + // null entries in the memo map to avoid re-processing as we go up. + MemoMap[EHPad] = nullptr; + Instruction *LastUselessPad = EHPad; + Value *AncestorToken; + for (AncestorToken = getParentPad(EHPad); + auto *AncestorPad = dyn_cast(AncestorToken); + AncestorToken = getParentPad(AncestorToken)) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(AncestorPad)) + continue; + assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]); + auto AncestorMemo = MemoMap.find(AncestorPad); + if (AncestorMemo == MemoMap.end()) { + UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap); + } else { + UnwindDestToken = AncestorMemo->second; + } + if (UnwindDestToken) + break; + LastUselessPad = AncestorPad; + } + + // Since the whole tree under LastUselessPad has no information, it all must + // match UnwindDestToken; record that to avoid repeating the search. + SmallVector Worklist(1, LastUselessPad); + while (!Worklist.empty()) { + Instruction *UselessPad = Worklist.pop_back_val(); + assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr); + MemoMap[UselessPad] = UnwindDestToken; + if (auto *CatchSwitch = dyn_cast(UselessPad)) { + for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) + for (User *U : HandlerBlock->getFirstNonPHI()->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } else { + assert(isa(UselessPad)); + for (User *U : UselessPad->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } + } + + return UnwindDestToken; +} + /// When we inline a basic block into an invoke, /// we have to turn all of the calls that can throw into invokes. /// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. 
-static BasicBlock *
-HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
+static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
+    BasicBlock *BB, BasicBlock *UnwindEdge,
+    UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
   for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
     Instruction *I = &*BBI++;
 
@@ -196,6 +427,31 @@ HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
     if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
       continue;
 
+    if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+      // This call is nested inside a funclet. If that funclet has an unwind
+      // destination within the inlinee, then unwinding out of this call would
+      // be UB. Rewriting this call to an invoke which targets the inlined
+      // invoke's unwind dest would give the call's parent funclet multiple
+      // unwind destinations, which is something that subsequent EH table
+      // generation can't handle and that the verifier rejects. So when we
+      // see such a call, leave it as a call.
+      auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
+      Value *UnwindDestToken =
+          getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
+      if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+        continue;
+#ifndef NDEBUG
+      Instruction *MemoKey;
+      if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+        MemoKey = CatchPad->getCatchSwitch();
+      else
+        MemoKey = FuncletPad;
+      assert(FuncletUnwindMap->count(MemoKey) &&
+             (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
+             "must get memoized to avoid confusing later searches");
+#endif // NDEBUG
+    }
+
     // Convert this function call into an invoke instruction. First, split the
     // basic block.
     BasicBlock *Split =
@@ -328,13 +584,23 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
 
   // This connects all the instructions which 'unwind to caller' to the invoke
   // destination.
+  UnwindDestMemoTy FuncletUnwindMap;
   for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
        BB != E; ++BB) {
     if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
       if (CRI->unwindsToCaller()) {
-        CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI);
+        auto *CleanupPad = CRI->getCleanupPad();
+        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
         CRI->eraseFromParent();
         UpdatePHINodes(&*BB);
+        // Finding a cleanupret with an unwind destination would confuse
+        // subsequent calls to getUnwindDestToken, so map the cleanuppad
+        // to short-circuit any such calls and recognize this as an "unwind
+        // to caller" cleanup.
+        assert(!FuncletUnwindMap.count(CleanupPad) ||
+               isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
+        FuncletUnwindMap[CleanupPad] =
+            ConstantTokenNone::get(Caller->getContext());
       }
     }
 
@@ -345,12 +611,41 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
     Instruction *Replacement = nullptr;
     if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
       if (CatchSwitch->unwindsToCaller()) {
+        Value *UnwindDestToken;
+        if (auto *ParentPad =
+                dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
+          // This catchswitch is nested inside another funclet. If that
+          // funclet has an unwind destination within the inlinee, then
+          // unwinding out of this catchswitch would be UB. Rewriting this
+          // catchswitch to unwind to the inlined invoke's unwind dest would
+          // give the parent funclet multiple unwind destinations, which is
+          // something that subsequent EH table generation can't handle and
+          // that the verifier rejects. So when we see such a call, leave it
+          // as "unwind to caller".
+          UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
+          if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+            continue;
+        } else {
+          // This catchswitch has no parent to inherit constraints from, and
+          // none of its descendants can have an unwind edge that exits it and
+          // targets another funclet in the inlinee. It may or may not have a
+          // descendant that definitively has an unwind to caller. In either
+          // case, we'll have to assume that any unwinds out of it may need to
+          // be routed to the caller, so treat it as though it has a definitive
+          // unwind to caller.
+          UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
+        }
         auto *NewCatchSwitch = CatchSwitchInst::Create(
             CatchSwitch->getParentPad(), UnwindDest,
             CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
             CatchSwitch);
         for (BasicBlock *PadBB : CatchSwitch->handlers())
           NewCatchSwitch->addHandler(PadBB);
+        // Propagate info for the old catchswitch over to the new one in
+        // the unwind map. This also serves to short-circuit any subsequent
+        // checks for the unwind dest of this catchswitch, which would get
+        // confused if they found the outer handler in the callee.
+        FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
         Replacement = NewCatchSwitch;
       }
     } else if (!isa<FuncletPadInst>(I)) {
@@ -369,8 +664,8 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
 
   for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
        BB != E; ++BB)
-    if (BasicBlock *NewBB =
-        HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest))
+    if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+            &*BB, UnwindDest, &FuncletUnwindMap))
       // Update any PHI nodes in the exceptional block to indicate that there
       // is now a new entry in them.
       UpdatePHINodes(NewBB);
@@ -1415,6 +1710,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     }
   }
 
+  // If we are inlining for an invoke instruction, we must make sure to rewrite
+  // any call instructions into invoke instructions. This is sensitive to which
+  // funclet pads were top-level in the inlinee, so must be done before
+  // rewriting the "parent pad" links.
+  if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
+    BasicBlock *UnwindDest = II->getUnwindDest();
+    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
+    if (isa<LandingPadInst>(FirstNonPHI)) {
+      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+    } else {
+      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+    }
+  }
+
   // Update the lexical scopes of the new funclets and callsites.
   // Anything that had 'none' as its parent is now nested inside the callsite's
   // EHPad.
@@ -1472,18 +1781,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     }
   }
 
-  // If we are inlining for an invoke instruction, we must make sure to rewrite
-  // any call instructions into invoke instructions.
-  if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
-    BasicBlock *UnwindDest = II->getUnwindDest();
-    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
-    if (isa<LandingPadInst>(FirstNonPHI)) {
-      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
-    } else {
-      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
-    }
-  }
-
   // Handle any inlined musttail call sites. In order for a new call site to be
   // musttail, the source of the clone and the inlined call site must have been
   // musttail. Therefore it's safe to return without merging control into the
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index d2793e5..abc9b65 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -944,37 +944,44 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
 static unsigned enforceKnownAlignment(Value *V, unsigned Align,
                                       unsigned PrefAlign,
                                       const DataLayout &DL) {
+  assert(PrefAlign > Align);
+
   V = V->stripPointerCasts();
 
   if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    // TODO: ideally, computeKnownBits ought to have used
+    // AllocaInst::getAlignment() in its computation already, making
+    // the below max redundant. But, as it turns out,
+    // stripPointerCasts recurses through infinite layers of bitcasts,
+    // while computeKnownBits is not allowed to traverse more than 6
+    // levels.
+    Align = std::max(AI->getAlignment(), Align);
+    if (PrefAlign <= Align)
+      return Align;
+
     // If the preferred alignment is greater than the natural stack alignment
     // then don't round up. This avoids dynamic stack realignment.
     if (DL.exceedsNaturalStackAlignment(PrefAlign))
      return Align;
-    // If there is a requested alignment and if this is an alloca, round up.
-    if (AI->getAlignment() >= PrefAlign)
-      return AI->getAlignment();
     AI->setAlignment(PrefAlign);
     return PrefAlign;
   }
 
   if (auto *GO = dyn_cast<GlobalObject>(V)) {
+    // TODO: as above, this shouldn't be necessary.
+    Align = std::max(GO->getAlignment(), Align);
+    if (PrefAlign <= Align)
+      return Align;
+
     // If there is a large requested alignment and we can, bump up the alignment
     // of the global. If the memory we set aside for the global may not be the
     // memory used by the final program then it is impossible for us to reliably
     // enforce the preferred alignment.
-    if (!GO->isStrongDefinitionForLinker())
+    if (!GO->canIncreaseAlignment())
       return Align;
 
-    if (GO->getAlignment() >= PrefAlign)
-      return GO->getAlignment();
-    // We can only increase the alignment of the global if it has no alignment
-    // specified or if it is not assigned a section. If it is assigned a
-    // section, the global could be densely packed with other objects in the
-    // section, increasing the alignment could cause padding issues.
-    if (!GO->hasSection() || GO->getAlignment() == 0)
-      GO->setAlignment(PrefAlign);
-    return GO->getAlignment();
+    GO->setAlignment(PrefAlign);
+    return PrefAlign;
   }
 
   return Align;
@@ -1585,3 +1592,205 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
 
   return false;
 }
+
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+  BitPart(Value *P, unsigned BW) : Provider(P) {
+    Provenance.resize(BW);
+  }
+
+  /// The Value that this is a bitreverse/bswap of.
+  Value *Provider;
+  /// The "provenance" of each bit. Provenance[A] = B means that bit A
+  /// in Provider becomes bit B in the result of this expression.
+  SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+  enum { Unset = -1 };
+};
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
+/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a nullptr
+/// BitParts (Value has been analyzed and the analysis failed), we use an
+/// Optional type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+///
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+                std::map<Value *, Optional<BitPart>> &BPS) {
+  auto I = BPS.find(V);
+  if (I != BPS.end())
+    return I->second;
+
+  auto &Result = BPS[V] = None;
+  auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    // If this is an or instruction, it may be an inner node of the bswap.
+    if (I->getOpcode() == Instruction::Or) {
+      auto &A = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      auto &B = collectBitParts(I->getOperand(1), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      if (!A || !B)
+        return Result;
+
+      // Try and merge the two together.
+      if (!A->Provider || A->Provider != B->Provider)
+        return Result;
+
+      Result = BitPart(A->Provider, BitWidth);
+      for (unsigned i = 0; i < A->Provenance.size(); ++i) {
+        if (A->Provenance[i] != BitPart::Unset &&
+            B->Provenance[i] != BitPart::Unset &&
+            A->Provenance[i] != B->Provenance[i])
+          return Result = None;
+
+        if (A->Provenance[i] == BitPart::Unset)
+          Result->Provenance[i] = B->Provenance[i];
+        else
+          Result->Provenance[i] = A->Provenance[i];
+      }
+
+      return Result;
+    }
+
+    // If this is a logical shift by a constant, recurse then shift the result.
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+      unsigned BitShift =
+          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+      // Ensure the shift amount is defined.
+      if (BitShift > BitWidth)
+        return Result;
+
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      // Perform the "shift" on BitProvenance.
+      auto &P = Result->Provenance;
+      if (I->getOpcode() == Instruction::Shl) {
+        P.erase(std::prev(P.end(), BitShift), P.end());
+        P.insert(P.begin(), BitShift, BitPart::Unset);
+      } else {
+        P.erase(P.begin(), std::next(P.begin(), BitShift));
+        P.insert(P.end(), BitShift, BitPart::Unset);
+      }
+
+      return Result;
+    }
+
+    // If this is a logical 'and' with a mask that clears bits, recurse then
+    // unset the appropriate bits.
+    if (I->getOpcode() == Instruction::And &&
+        isa<ConstantInt>(I->getOperand(1))) {
+      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
+      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+      // Check that the mask allows a multiple of 8 bits for a bswap, for an
+      // early exit.
+      unsigned NumMaskedBits = AndMask.countPopulation();
+      if (!MatchBitReversals && NumMaskedBits % 8 != 0)
+        return Result;
+
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1)
+        // If the AndMask is zero for this bit, clear the bit.
+        if ((AndMask & Bit) == 0)
+          Result->Provenance[i] = BitPart::Unset;
+
+      return Result;
+    }
+  }
+
+  // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+  // the input value to the bswap/bitreverse.
+  Result = BitPart(V, BitWidth);
+  for (unsigned i = 0; i < BitWidth; ++i)
+    Result->Provenance[i] = i;
+  return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+                                          unsigned BitWidth) {
+  if (From % 8 != To % 8)
+    return false;
+  // Convert from bit indices to byte indices and check for a byte reversal.
+  From >>= 3;
+  To >>= 3;
+  BitWidth >>= 3;
+  return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+                                               unsigned BitWidth) {
+  return From == BitWidth - To - 1;
+}
+
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+bool llvm::recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts) {
+  if (Operator::getOpcode(I) != Instruction::Or)
+    return false;
+  if (!MatchBSwaps && !MatchBitReversals)
+    return false;
+  IntegerType *ITy = dyn_cast<IntegerType>(I->getType());
+  if (!ITy || ITy->getBitWidth() > 128)
+    return false;  // Can't do vectors or integers > 128 bits.
+  unsigned BW = ITy->getBitWidth();
+
+  // Try to find all the pieces corresponding to the bswap.
+  std::map<Value *, Optional<BitPart>> BPS;
+  auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
+  if (!Res)
+    return false;
+  auto &BitProvenance = Res->Provenance;
+
+  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+  // only byteswap values with an even number of bytes.
+  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
+  for (unsigned i = 0; i < BW; ++i) {
+    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
+    OKForBitReverse &=
+        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
+  }
+
+  Intrinsic::ID Intrin;
+  if (OKForBSwap && MatchBSwaps)
+    Intrin = Intrinsic::bswap;
+  else if (OKForBitReverse && MatchBitReversals)
+    Intrin = Intrinsic::bitreverse;
+  else
+    return false;
+
+  Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
+  InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
+  return true;
+}
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index dc07440..2f3c311 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -970,15 +970,34 @@ static Value *valueHasFloatPrecision(Value *Val) {
   return nullptr;
 }
 
-//===----------------------------------------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+/// Any floating-point library function that we're trying to simplify will have
+/// a signature of the form: fptype foo(fptype param1, fptype param2, ...).
+/// CheckDoubleTy indicates that 'fptype' must be 'double'.
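To make the shape concrete before the helper itself, here are a few illustrative prototypes (made-up examples, not taken from the patch). With NumParams == 1 and CheckDoubleTy == true, only the first would be accepted:

    extern "C" double floor(double);       // accepted: double(double)
    extern "C" float floorf(float);        // rejected: return type is not double
    extern "C" double ldexp(double, int);  // rejected: two parameters, and the
                                           // int parameter differs from the
                                           // return type
    // With CheckDoubleTy == false, any FP type works as long as every
    // parameter type equals the return type, e.g. for NumParams == 2:
    extern "C" float fminf(float, float);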
+static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams, + bool CheckDoubleTy) { + FunctionType *FT = F->getFunctionType(); + if (FT->getNumParams() != NumParams) + return false; + + // The return type must match what we're looking for. + Type *RetTy = FT->getReturnType(); + if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy()) + return false; + + // Each parameter must match the return type, and therefore, match every other + // parameter too. + for (const Type *ParamTy : FT->params()) + if (ParamTy != RetTy) + return false; -Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, - bool CheckRetType) { + return true; +} + +/// Shrink double -> float for unary functions like 'floor'. +static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, + bool CheckRetType) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) + if (!matchesFPLibFunctionSignature(Callee, 1, true)) return nullptr; if (CheckRetType) { @@ -1013,15 +1032,10 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, return B.CreateFPExt(V, B.getDoubleTy()); } -// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax' -Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { +/// Shrink double -> float for binary functions like 'fmin/fmax'. +static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) + if (!matchesFPLibFunctionSignature(Callee, 2, true)) return nullptr; // If this is something like 'fmin((double)floatval1, (double)floatval2)', @@ -1394,12 +1408,21 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - + Value *Ret = nullptr; if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" || Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); + // FIXME: Refactor - this check is repeated all over this file and even in the + // preceding call to shrink double -> float. + + // Make sure this has 1 argument of FP type, which matches the result type. 
+ FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + if (!CI->hasUnsafeAlgebra()) return Ret; diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll index a9ae00c..9996c0d 100644 --- a/test/CodeGen/AArch64/cxx-tlscc.ll +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -8,6 +8,7 @@ @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -74,3 +75,29 @@ __tls_init.exit: ; CHECK-NOT: ldp d27, d26 ; CHECK-NOT: ldp d29, d28 ; CHECK-NOT: ldp d31, d30 + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 +; CHECK: blr +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/cse-flags.ll b/test/CodeGen/ARM/cse-flags.ll new file mode 100644 index 0000000..c18e2fc --- /dev/null +++ b/test/CodeGen/ARM/cse-flags.ll @@ -0,0 +1,43 @@ +; RUN: llc -asm-verbose=false < %s | FileCheck %s +; PR26063 + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK: .LBB0_1: +; CHECK-NEXT: bl f{{$}} +; CHECK-NEXT: ldrb r[[T0:[0-9]+]], [r{{[0-9]+}}, #1]!{{$}} +; CHECK-NEXT: cmp r{{[0-9]+}}, #1{{$}} +; CHECK-NEXT: cmpne r[[T0]], #0{{$}} +; CHECK-NEXT: bne .LBB0_1{{$}} +define i8* @h(i8* readonly %a, i32 %b, i32 %c) { +entry: + %0 = load i8, i8* %a, align 1 + %tobool4 = icmp ne i8 %0, 0 + %cmp5 = icmp ne i32 %b, 1 + %1 = and i1 %cmp5, %tobool4 + br i1 %1, label %while.body.preheader, label %while.end + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ] + %call = tail call i32 bitcast (i32 (...)* @f to i32 ()*)() + %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1 + %2 = load i8, i8* %incdec.ptr, align 1 + %tobool = icmp ne i8 %2, 0 + %cmp = icmp ne i32 %call, 1 + %3 = and i1 %cmp, %tobool + br i1 %3, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + %incdec.ptr.lcssa = phi i8* [ %incdec.ptr, %while.body ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %a.addr.0.lcssa = phi i8* [ %a, %entry ], [ %incdec.ptr.lcssa, %while.end.loopexit ] + ret i8* %a.addr.0.lcssa +} + +declare i32 @f(...) 
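A plausible C-level source for the new ARM test above, offered only as an illustrative reconstruction (the authoritative reduction lives in PR26063). The point is that both the entry check and the loop back-edge re-test the call result and the loaded byte, so the flag-setting compares must not be reused across the call to f(), which clobbers CPSR:

    // Illustrative reconstruction; names mirror the IR above.
    extern "C" int f(void);

    extern "C" const char *h(const char *a, int b, int c) {
      (void)c;                 // third parameter is unused in the IR, too
      while (b != 1 && *a) {   // cmp/cmpne/bne in the CHECK lines
        b = f();               // call clobbers the condition flags
        ++a;                   // ldrb with post-increment writeback
      }
      return a;
    }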
diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll index 7b776d4..11173bb 100644 --- a/test/CodeGen/ARM/cxx-tlscc.ll +++ b/test/CodeGen/ARM/cxx-tlscc.ll @@ -8,6 +8,7 @@ @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -44,3 +45,13 @@ __tls_init.exit: ; CHECK-NOT: pop {r9, r12} ; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc} ; CHECK: pop {lr} + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: push {r1, r2, r3, r4, r7, lr} +; CHECK-NOT: push {r9, r12} +; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} +; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7} +; CHECK: blx +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll index 66743f3..46fef76 100644 --- a/test/CodeGen/ARM/memfunc.ll +++ b/test/CodeGen/ARM/memfunc.ll @@ -1,10 +1,10 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN -; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI -; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI +; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS --check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK define void @f1(i8* %dest, i8* %src) { entry: @@ -402,8 +402,8 @@ entry: ; CHECK: arr1: ; CHECK-IOS: .align 3 ; CHECK-DARWIN: .align 2 -; CHECK-EABI: .align 2 -; CHECK-GNUEABI: .align 2 +; CHECK-EABI-NOT: .align +; CHECK-GNUEABI-NOT: .align ; CHECK: arr2: ; CHECK: {{\.section.+foo,bar}} ; CHECK-NOT: .align diff --git a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll b/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll deleted file mode 100644 index 4580795..0000000 --- a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s -; CHECK: addl - -; The two additions are the same , but have different flags. 
-; In theory this code should never be generated by the frontend, but this -; tries to test that two identical instructions with two different flags -; actually generate two different nodes. -; -; Normally the combiner would see this condition without the flags -; and optimize the result of the sub into a register clear -; (the final result would be 0). With the different flags though the combiner -; needs to keep the add + sub nodes, because the two nodes result as different -; nodes and so cannot assume that the subtraction of the two nodes -; generates 0 as result -define i32 @foo(i32 %a, i32 %b) { - %1 = add i32 %a, %b - %2 = add nsw i32 %a, %b - %3 = sub i32 %1, %2 - ret i32 %3 -} diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll index 70fe501..6c8e45e 100644 --- a/test/CodeGen/X86/cxx_tlscc64.ll +++ b/test/CodeGen/X86/cxx_tlscc64.ll @@ -4,11 +4,13 @@ ; tricks similar to AArch64 fast TLS calling convention (r255821). ; Applying tricks on x86-64 similar to r255821. ; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -O0 | FileCheck %s --check-prefix=CHECK-O0 %struct.S = type { i8 } @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare void @_ZN1SC1Ev(%struct.S*) declare void @_ZN1SD1Ev(%struct.S*) @@ -50,3 +52,28 @@ init.i: __tls_init.exit: ret %struct.S* @sg } + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: pushq %r11 +; CHECK-NOT: pushq %r10 +; CHECK-NOT: pushq %r9 +; CHECK-NOT: pushq %r8 +; CHECK-NOT: pushq %rsi +; CHECK-NOT: pushq %rdx +; CHECK-NOT: pushq %rcx +; CHECK-NOT: pushq %rbx +; CHECK: callq +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} + +; Make sure at O0 we don't overwrite RBP. +; CHECK-O0-LABEL: _ZTW4sum2 +; CHECK-O0: pushq %rbp +; CHECK-O0: movq %rsp, %rbp +; CHECK-O0-NOT: movq %r{{.*}}, (%rbp) +define cxx_fast_tlscc i32* @_ZTW4sum2() #0 { + ret i32* @sum1 +} + +attributes #0 = { nounwind "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll index 7c00f40..eb87f71 100644 --- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll +++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll @@ -1,11 +1,5 @@ ; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK ; -; This test checks that we do not use shrink-wrapping when -; the function does not have any frame pointer and may unwind. -; This is a workaround for a limitation in the emission of -; the CFI directives, that are not correct in such case. -; PR25614 -; ; Note: This test cannot be merged with the shrink-wrapping tests ; because the booleans set on the command line take precedence on ; the target logic that disable shrink-wrapping. @@ -13,6 +7,12 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "x86_64-apple-macosx" +; This test checks that we do not use shrink-wrapping when +; the function does not have any frame pointer and may unwind. +; This is a workaround for a limitation in the emission of +; the CFI directives, that are not correct in such case. +; PR25614 +; ; No shrink-wrapping should occur here, until the CFI information are fixed. 
 ; CHECK-LABEL: framelessUnwind:
 ;
@@ -151,3 +151,74 @@ false:
 }
 
 attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
+
+
+; Check that we generate correct code for segmented stack.
+; We used to emit the code at the entry point of the function
+; instead of just before the prologue.
+; For now, shrink-wrapping is disabled on segmented stack functions: PR26107.
+;
+; CHECK-LABEL: segmentedStack:
+; CHECK: cmpq
+; CHECK-NEXT: ja [[ENTRY_LABEL:LBB[0-9_]+]]
+;
+; CHECK: callq ___morestack
+; CHECK-NEXT: retq
+;
+; CHECK: [[ENTRY_LABEL]]:
+; Prologue
+; CHECK: push
+;
+; In PR26107, we used to drop these two basic blocks, because
+; the segmentedStack entry block was jumping directly to
+; the place where the prologue is actually needed, which is
+; the call to memcmp.
+; Then, those two basic blocks did not have any predecessors
+; anymore and were removed.
+;
+; Check if vk1 is null
+; CHECK: testq %rdi, %rdi
+; CHECK-NEXT: je [[STRINGS_EQUAL:LBB[0-9_]+]]
+;
+; Check if vk2 is null
+; CHECK: testq %rsi, %rsi
+; CHECK-NEXT: je [[STRINGS_EQUAL]]
+;
+; CHECK: [[STRINGS_EQUAL]]
+; CHECK-NEXT: popq
+define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
+entry:
+  %cmp.i = icmp eq i8* %vk1, null
+  %cmp1.i = icmp eq i8* %vk2, null
+  %brmerge.i = or i1 %cmp.i, %cmp1.i
+  %cmp1.mux.i = and i1 %cmp.i, %cmp1.i
+  br i1 %brmerge.i, label %__go_ptr_strings_equal.exit, label %if.end4.i
+
+if.end4.i:                                        ; preds = %entry
+  %tmp = getelementptr inbounds i8, i8* %vk1, i64 8
+  %tmp1 = bitcast i8* %tmp to i64*
+  %tmp2 = load i64, i64* %tmp1, align 8
+  %tmp3 = getelementptr inbounds i8, i8* %vk2, i64 8
+  %tmp4 = bitcast i8* %tmp3 to i64*
+  %tmp5 = load i64, i64* %tmp4, align 8
+  %cmp.i.i = icmp eq i64 %tmp2, %tmp5
+  br i1 %cmp.i.i, label %land.rhs.i.i, label %__go_ptr_strings_equal.exit
+
+land.rhs.i.i:                                     ; preds = %if.end4.i
+  %tmp6 = bitcast i8* %vk2 to i8**
+  %tmp7 = load i8*, i8** %tmp6, align 8
+  %tmp8 = bitcast i8* %vk1 to i8**
+  %tmp9 = load i8*, i8** %tmp8, align 8
+  %call.i.i = tail call i32 @memcmp(i8* %tmp9, i8* %tmp7, i64 %tmp2) #5
+  %cmp4.i.i = icmp eq i32 %call.i.i, 0
+  br label %__go_ptr_strings_equal.exit
+
+__go_ptr_strings_equal.exit:                      ; preds = %land.rhs.i.i, %if.end4.i, %entry
+  %retval.0.i = phi i1 [ %cmp1.mux.i, %entry ], [ false, %if.end4.i ], [ %cmp4.i.i, %land.rhs.i.i ]
+  ret i1 %retval.0.i
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) #5
+
+attributes #5 = { nounwind readonly ssp uwtable "split-stack" }
diff --git a/test/DebugInfo/ARM/PR26163.ll b/test/DebugInfo/ARM/PR26163.ll
new file mode 100644
index 0000000..9ab0e35
--- /dev/null
+++ b/test/DebugInfo/ARM/PR26163.ll
@@ -0,0 +1,107 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
+;
+; Checks that we're creating two ranges, one that terminates immediately
+; and one that spans the rest of the function. This isn't necessarily the
+; best thing to do here (and also not necessarily correct, since the first
+; one has a bit_piece), but it is what is currently being emitted, any
+; change here needs to be intentional, so the test is very specific.
+;
+; CHECK: .debug_loc contents:
+; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
+; CHECK:                Ending address offset: 0x0000000000000004
+; CHECK:                 Location description: 10 00 9f
+; CHECK:             Beginning address offset: 0x0000000000000004
+; CHECK:                Ending address offset: 0x0000000000000014
+; CHECK:                 Location description: 10 00 9f

+; Created from the following test case (PR26163) with
+; clang -cc1 -triple armv4t--freebsd11.0-gnueabi -emit-obj -debug-info-kind=standalone -O2 -x c test.c
+;
+; typedef unsigned int size_t;
+; struct timeval {
+;   long long tv_sec;
+;   int tv_usec;
+; };
+;
+; void *memset(void *, int, size_t);
+; void foo(void);
+;
+; static void
+; bar(int value)
+; {
+;   struct timeval lifetime;
+;
+;   memset(&lifetime, 0, sizeof(struct timeval));
+;   lifetime.tv_sec = value;
+;
+;   foo();
+; }
+;
+; int
+; parse_config_file(void)
+; {
+;   int value;
+;
+;   bar(value);
+;   return (0);
+; }

+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t--freebsd11.0-gnueabi"

+%struct.timeval = type { i64, i32 }

+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)

+declare void @foo()

+define i32 @parse_config_file() !dbg !4 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !26), !dbg !27
+  tail call void @llvm.dbg.declare(metadata %struct.timeval* undef, metadata !16, metadata !26), !dbg !29
+  tail call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !16, metadata !30), !dbg !29
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29
+  tail call void @foo() #3, !dbg !32
+  ret i32 0, !dbg !33
+}


+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23, !24}
+!llvm.ident = !{!25}

+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!1 = !DIFile(filename: "<stdin>", directory: "/home/ubuntu/bugs")
+!2 = !{}
+!3 = !{!4, !11}
+!4 = distinct !DISubprogram(name: "parse_config_file", scope: !5, file: !5, line: 22, type: !6, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: true, variables: !9)
+!5 = !DIFile(filename: "test.c", directory: "/home/ubuntu/bugs")
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8}
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DILocalVariable(name: "value", scope: !4, file: !5, line: 24, type: !8)
+!11 = distinct !DISubprogram(name: "bar", scope: !5, file: !5, line: 11, type: !12, isLocal: true, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, variables: !14)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !8}
+!14 = !{!15, !16}
+!15 = !DILocalVariable(name: "value", arg: 1, scope: !11, file: !5, line: 11, type: !8)
+!16 = !DILocalVariable(name: "lifetime", scope: !11, file: !5, line: 13, type: !17)
+!17 = !DICompositeType(tag: DW_TAG_structure_type, name: "timeval", file: !5, line: 2, size: 128, align: 64, elements: !18)
+!18 = !{!19, !21}
+!19 = !DIDerivedType(tag: DW_TAG_member, name: "tv_sec", scope: !17, file: !5, line: 3, baseType: !20, size: 64, align: 64)
+!20 = !DIBasicType(name: "long long int", size: 64, align: 64, encoding: DW_ATE_signed)
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "tv_usec", scope: !17, file: !5, line: 4, baseType: !8, size: 32, align: 32, offset: 64)
+!22 = !{i32 2, !"Debug Info Version", i32 3}
+!23 = !{i32 1, !"wchar_size", i32 4}
+!24 = !{i32 1, !"min_enum_size", i32 4}
+!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)"}
+!26 = !DIExpression()
+!27 = !DILocation(line: 11, scope: !11, inlinedAt: !28)
+!28 = distinct !DILocation(line: 26, scope: !4)
+!29 = !DILocation(line: 13, scope: !11, inlinedAt: !28)
+!30 = !DIExpression(DW_OP_bit_piece, 0, 64)
+!31 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!32 = !DILocation(line: 18, scope: !11, inlinedAt: !28)
+!33 = !DILocation(line: 27, scope: !4)
diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
index 7df88b1..b91a043 100644
--- a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 declare i32 @FB()
 
diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
index d35418b..94938a8 100644
--- a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 declare i32 @FB()
 
diff --git a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
index 0d1a1ec..72449f3 100644
--- a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 define i32 @bar() nounwind {
   ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
index 31ed752..31271b5 100644
--- a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
-; XFAIL: win32
+; XFAIL: mingw32,win32
 ; This test should fail until remote symbol resolution is supported.
 
 define i32 @main() nounwind {
diff --git a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
index bbeab10..9d1abbc 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.
diff --git a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll index 0aa19b2..afa8a95 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; Check that a variable is always aligned as specified. diff --git a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll index 13bac29..f996159 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define double @test(double* %DP, double %Arg) nounwind { %D = load double, double* %DP ; [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll index 5d5480e..329dc5c 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll index ef74fa0..44557ea 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll index c2260fc..a249c2f 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll index 2a45472..2817053 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git 
a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll index 249aad2..6fbb2bc 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll index 32c58ee..ce09417 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll index aaf3ebc..bc477c2 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define i32 @bar() nounwind { ret i32 0 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll index a0d9410..001a617 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; This test should fail until remote symbol resolution is supported. define i32 @main() nounwind { diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll index 9b4e246..4c4256e 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; The intention of this test is to verify that symbols mapped to COMMON in ELF ; work as expected. diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll index 88a561b..1621501 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; Check that a variable is always aligned as specified. 
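A rough sketch of why these XFAIL lines need "mingw32" in addition to "win32", as I understand lit's XFAIL handling (the authoritative logic lives in utils/lit and also consults feature names, not only the host triple): each comma-separated entry marks the test expected-to-fail when it occurs as a substring of the host description, and MinGW hosts spell theirs with "mingw32", which does not contain "win32".

    // Illustrative only; not lit's actual implementation.
    #include <sstream>
    #include <string>

    static bool expectedToFail(const std::string &HostTriple,
                               const std::string &XFailEntries) {
      std::istringstream SS(XFailEntries);
      std::string Entry;
      while (std::getline(SS, Entry, ','))   // split on commas
        if (!Entry.empty() && HostTriple.find(Entry) != std::string::npos)
          return true;                       // substring match on the triple
      return false;
    }
    // expectedToFail("i686-pc-mingw32", "win32")          -> false
    // expectedToFail("i686-pc-mingw32", "mingw32,win32")  -> true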
diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll index 484541a..6ff8704 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define double @test(double* %DP, double %Arg) nounwind { %D = load double, double* %DP ; [#uses=1] diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll index adc3e94..a7c8bfe 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll index 8ab3fd5..a028df6 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll index a47c801..d369d2b 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll index 210ac6f..e918dab 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/MC/AArch64/inst-directive.s b/test/MC/AArch64/inst-directive.s index 3bb620f..7fd5200 100644 --- a/test/MC/AArch64/inst-directive.s +++ b/test/MC/AArch64/inst-directive.s @@ -1,7 +1,14 @@ // RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=asm -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ASM -// RUN: llvm-mc %s 
-triple=aarch64-none-linux-gnu -filetype=obj -o - \ -// RUN: | llvm-readobj -s -sd | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s --check-prefix=CHECK-SYMS + +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=asm -o - \ +// RUN: | FileCheck %s --check-prefix=CHECK-ASM +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s --check-prefix=CHECK-SYMS .section .inst.aarch64_inst @@ -22,3 +29,7 @@ aarch64_inst: // CHECK-OBJ: SectionData ( // CHECK-OBJ-NEXT: 0000: 2040105E // CHECK-OBJ-NEXT: ) + +// CHECK-SYMS-NOT: 0000000000000000 .inst.aarch64_inst 00000000 $d +// CHECK-SYMS: 0000000000000000 .inst.aarch64_inst 00000000 $x +// CHECK-SYMS-NOT: 0000000000000000 .inst.aarch64_inst 00000000 $d diff --git a/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll new file mode 100644 index 0000000..36440da --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -loop-unroll -codegenprepare < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK-LABEL: @f +define i32 @f(i32 %a) #0 { +; CHECK: call i32 @llvm.bitreverse.i32 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %or + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %b.07 = phi i32 [ 0, %entry ], [ %or, %for.body ] + %shr = lshr i32 %a, %i.08 + %and = and i32 %shr, 1 + %sub = sub nuw nsw i32 31, %i.08 + %shl = shl i32 %and, %sub + %or = or i32 %shl, %b.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, 32 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3 +} + +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git b7441a0f42c43a8eea9e3e706be187252db747fa)"} +!3 = distinct !{!3, !4} +!4 = !{!"llvm.loop.unroll.full"} diff --git a/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg new file mode 100644 index 0000000..98c6700 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/CodeGenPrepare/bitreverse-hang.ll b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll new file mode 100644 index 0000000..c81dcc1 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-unroll -codegenprepare -S | FileCheck %s + +; This test is a worst-case scenario for bitreversal/byteswap detection. 
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations which will
+; totally hang if we don't deal with this case cleverly.

+@b = common global i32 0, align 4

+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body

+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body

+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}

+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }

+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}

+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/Inline/inline-funclets.ll b/test/Transforms/Inline/inline-funclets.ll
new file mode 100644
index 0000000..362e03d
--- /dev/null
+++ b/test/Transforms/Inline/inline-funclets.ll
@@ -0,0 +1,455 @@
+; RUN: opt -inline -S %s | FileCheck %s

+declare void @g()


+;;; Test with a call in a funclet that needs to remain a call
+;;; when inlined because the funclet doesn't unwind to caller.
+;;; CHECK-LABEL: define void @test1(
+define void @test1() personality void ()* @g {
+entry:
+; CHECK-NEXT: entry:
+  invoke void @test1_inlinee()
+    to label %exit unwind label %cleanup
+cleanup:
+  %pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad) ]
+  cleanupret from %pad unwind to caller
+exit:
+  ret void
+}

+define void @test1_inlinee() alwaysinline personality void ()* @g {
+entry:
+  invoke void @g()
+    to label %exit unwind label %cleanup.inner
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT:   unwind label %[[cleanup_inner:.+]]

+cleanup.inner:
+  %pad.inner = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad.inner) ]
+  cleanupret from %pad.inner unwind label %cleanup.outer
+; CHECK: [[cleanup_inner]]:
+; The call here needs to remain a call because pad.inner has a cleanupret
+; that stays within the inlinee.
+; CHECK-NEXT: %[[pad_inner:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: call void @g() [ "funclet"(token %[[pad_inner]]) ] +; CHECK-NEXT: cleanupret from %[[pad_inner]] unwind label %[[cleanup_outer:.+]] + +cleanup.outer: + %pad.outer = cleanuppad within none [] + call void @g() [ "funclet"(token %pad.outer) ] + cleanupret from %pad.outer unwind to caller +; CHECK: [[cleanup_outer]]: +; The call and cleanupret here need to be redirected to caller cleanup +; CHECK-NEXT: %[[pad_outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad_outer]]) ] +; CHECK-NEXT: unwind label %cleanup +; CHECK: cleanupret from %[[pad_outer]] unwind label %cleanup{{$}} + +exit: + ret void +} + + + +;;; Test with an "unwind to caller" catchswitch in a parent funclet +;;; that needs to remain "unwind to caller" because the parent +;;; doesn't unwind to caller. +;;; CHECK-LABEL: define void @test2( +define void @test2() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test2_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test2_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup1 +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup1:.+]] + +cleanup1: + %outer = cleanuppad within none [] + invoke void @g() [ "funclet"(token %outer) ] + to label %ret1 unwind label %catchswitch +; CHECK: [[cleanup1]]: +; CHECK-NEXT: %[[outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[outer]]) ] +; CHECK-NEXT: unwind label %[[catchswitch:.+]] + +catchswitch: + %cs = catchswitch within %outer [label %catch] unwind to caller +; CHECK: [[catchswitch]]: +; The catchswitch here needs to remain "unwind to caller" since %outer +; has a cleanupret that remains within the inlinee. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[outer]] [label %[[catch:.+]]] unwind to caller + +catch: + %inner = catchpad within %cs [] + call void @g() [ "funclet"(token %inner) ] + catchret from %inner to label %ret1 +; CHECK: [[catch]]: +; The call here needs to remain a call since it too is within %outer +; CHECK: %[[inner:[^ ]+]] = catchpad within %[[cs]] +; CHECK-NEXT: call void @g() [ "funclet"(token %[[inner]]) ] + +ret1: + cleanupret from %outer unwind label %cleanup2 +; CHECK: cleanupret from %[[outer]] unwind label %[[cleanup2:.+]] + +cleanup2: + %later = cleanuppad within none [] + cleanupret from %later unwind to caller +; CHECK: [[cleanup2]]: +; The cleanupret here needs to get redirected to the caller cleanup +; CHECK-NEXT: %[[later:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: cleanupret from %[[later]] unwind label %cleanup{{$}} + +exit: + ret void +} + + +;;; Test with a call in a cleanup that has no definitive unwind +;;; destination, that must be rewritten to an invoke. 
+;;; CHECK-LABEL: define void @test3( +define void @test3() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test3_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test3_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup:.+]] + +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + unreachable +; CHECK: [[cleanup]]: +; The call must be rewritten to an invoke targeting the caller cleanup +; because it may well unwind to there. +; CHECK-NEXT: %[[pad:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +exit: + ret void +} + + +;;; Test with a catchswitch in a cleanup that has no definitive +;;; unwind destination, that must be rewritten to unwind to the +;;; inlined invoke's unwind dest +;;; CHECK-LABEL: define void @test4( +define void @test4() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test4_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test4_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup:.+]] + +cleanup: + %clean = cleanuppad within none [] + invoke void @g() [ "funclet"(token %clean) ] + to label %unreachable unwind label %dispatch +; CHECK: [[cleanup]]: +; CHECK-NEXT: %[[clean:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[clean]]) ] +; CHECK-NEXT: unwind label %[[dispatch:.+]] + +dispatch: + %cs = catchswitch within %clean [label %catch] unwind to caller +; CHECK: [[dispatch]]: +; The catchswitch must be rewritten to unwind to %cleanup in the caller +; because it may well unwind to there. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[clean]] [label %[[catch:.+]]] unwind label %cleanup{{$}} + +catch: + catchpad within %cs [] + br label %unreachable +unreachable: + unreachable +exit: + ret void +} + + +;;; Test with multiple levels of nesting, and unwind dests +;;; that need to be inferred from ancestors, descendants, +;;; and cousins. 
+;;; CHECK-LABEL: define void @test5( +define void @test5() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test5_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test5_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %cont unwind label %noinfo.root +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %[[cont:[^ ]+]] unwind label %[[noinfo_root:.+]] + +noinfo.root: + %noinfo.root.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %noinfo.root.pad) ] + invoke void @g() [ "funclet"(token %noinfo.root.pad) ] + to label %noinfo.root.cont unwind label %noinfo.left +; CHECK: [[noinfo_root]]: +; Nothing under "noinfo.root" has a definitive unwind destination, so +; we must assume all of it may actually unwind, and redirect unwinds +; to the cleanup in the caller. +; CHECK-NEXT: %[[noinfo_root_pad:[^ ]+]] = cleanuppad within none [] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: to label %[[next:[^ ]+]] unwind label %cleanup{{$}} +; CHECK: [[next]]: +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: to label %[[noinfo_root_cont:[^ ]+]] unwind label %[[noinfo_left:.+]] + +noinfo.left: + %noinfo.left.pad = cleanuppad within %noinfo.root.pad [] + invoke void @g() [ "funclet"(token %noinfo.left.pad) ] + to label %unreachable unwind label %noinfo.left.child +; CHECK: [[noinfo_left]]: +; CHECK-NEXT: %[[noinfo_left_pad:[^ ]+]] = cleanuppad within %[[noinfo_root_pad]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_pad]]) ] +; CHECK-NEXT: unwind label %[[noinfo_left_child:.+]] + +noinfo.left.child: + %noinfo.left.child.cs = catchswitch within %noinfo.left.pad [label %noinfo.left.child.catch] unwind to caller +; CHECK: [[noinfo_left_child]]: +; CHECK-NEXT: %[[noinfo_left_child_cs:[^ ]+]] = catchswitch within %[[noinfo_left_pad]] [label %[[noinfo_left_child_catch:[^ ]+]]] unwind label %cleanup{{$}} + +noinfo.left.child.catch: + %noinfo.left.child.pad = catchpad within %noinfo.left.child.cs [] + call void @g() [ "funclet"(token %noinfo.left.child.pad) ] + br label %unreachable +; CHECK: [[noinfo_left_child_catch]]: +; CHECK-NEXT: %[[noinfo_left_child_pad:[^ ]+]] = catchpad within %[[noinfo_left_child_cs]] [] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_child_pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +noinfo.root.cont: + invoke void @g() [ "funclet"(token %noinfo.root.pad) ] + to label %unreachable unwind label %noinfo.right +; CHECK: [[noinfo_root_cont]]: +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: unwind label %[[noinfo_right:.+]] + +noinfo.right: + %noinfo.right.cs = catchswitch within %noinfo.root.pad [label %noinfo.right.catch] unwind to caller +; CHECK: [[noinfo_right]]: +; CHECK-NEXT: %[[noinfo_right_cs:[^ ]+]] = catchswitch within %[[noinfo_root_pad]] [label %[[noinfo_right_catch:[^ ]+]]] unwind label %cleanup{{$}} + +noinfo.right.catch: + %noinfo.right.pad = catchpad within %noinfo.right.cs [] + invoke void @g() [ "funclet"(token %noinfo.right.pad) ] + to label %unreachable unwind label %noinfo.right.child +; CHECK: [[noinfo_right_catch]]: +; CHECK-NEXT: %[[noinfo_right_pad:[^ ]+]] = catchpad within %[[noinfo_right_cs]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token 
%[[noinfo_right_pad]]) ]
+; CHECK-NEXT: unwind label %[[noinfo_right_child:.+]]
+
+noinfo.right.child:
+  %noinfo.right.child.pad = cleanuppad within %noinfo.right.pad []
+  call void @g() [ "funclet"(token %noinfo.right.child.pad) ]
+  br label %unreachable
+; CHECK: [[noinfo_right_child]]:
+; CHECK-NEXT: %[[noinfo_right_child_pad:[^ ]+]] = cleanuppad within %[[noinfo_right_pad]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_right_child_pad]]) ]
+; CHECK-NEXT: unwind label %cleanup{{$}}
+
+cont:
+  invoke void @g()
+    to label %exit unwind label %implicit.root
+; CHECK: [[cont]]:
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT: unwind label %[[implicit_root:.+]]
+
+implicit.root:
+  %implicit.root.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %implicit.root.pad) ]
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %implicit.root.cont unwind label %implicit.left
+; CHECK: [[implicit_root]]:
+; There's an unwind edge to %internal in implicit.right, and we need to propagate that
+; fact down to implicit.right.grandchild, up to implicit.root, and down to
+; implicit.left.child.catch, leaving all calls and "unwind to caller" catchswitches
+; alone so they don't conflict with the unwind edge in implicit.right
+; CHECK-NEXT: %[[implicit_root_pad:[^ ]+]] = cleanuppad within none
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: to label %[[implicit_root_cont:[^ ]+]] unwind label %[[implicit_left:.+]]
+
+implicit.left:
+  %implicit.left.pad = cleanuppad within %implicit.root.pad []
+  invoke void @g() [ "funclet"(token %implicit.left.pad) ]
+    to label %unreachable unwind label %implicit.left.child
+; CHECK: [[implicit_left]]:
+; CHECK-NEXT: %[[implicit_left_pad:[^ ]+]] = cleanuppad within %[[implicit_root_pad:[^ ]+]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_left_pad]]) ]
+; CHECK-NEXT: unwind label %[[implicit_left_child:.+]]
+
+implicit.left.child:
+  %implicit.left.child.cs = catchswitch within %implicit.left.pad [label %implicit.left.child.catch] unwind to caller
+; CHECK: [[implicit_left_child]]:
+; CHECK-NEXT: %[[implicit_left_child_cs:[^ ]+]] = catchswitch within %[[implicit_left_pad]] [label %[[implicit_left_child_catch:[^ ]+]]] unwind to caller
+
+implicit.left.child.catch:
+  %implicit.left.child.pad = catchpad within %implicit.left.child.cs []
+  call void @g() [ "funclet"(token %implicit.left.child.pad) ]
+  br label %unreachable
+; CHECK: [[implicit_left_child_catch]]:
+; CHECK-NEXT: %[[implicit_left_child_pad:[^ ]+]] = catchpad within %[[implicit_left_child_cs]]
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_left_child_pad]]) ]
+
+implicit.root.cont:
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %unreachable unwind label %implicit.right
+; CHECK: [[implicit_root_cont]]:
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: unwind label %[[implicit_right:.+]]
+
+implicit.right:
+  %implicit.right.cs = catchswitch within %implicit.root.pad [label %implicit.right.catch] unwind label %internal
+; CHECK: [[implicit_right]]:
+; This is the unwind edge (to %internal) whose existence needs to get propagated around the "implicit" tree
+; CHECK-NEXT: %[[implicit_right_cs:[^ ]+]] = catchswitch within %[[implicit_root_pad]] [label %[[implicit_right_catch:[^ ]+]]] unwind label %[[internal:.+]]
+
+implicit.right.catch:
+  %implicit.right.pad = catchpad
within %implicit.right.cs [] + invoke void @g() [ "funclet"(token %implicit.right.pad) ] + to label %unreachable unwind label %implicit.right.child +; CHECK: [[implicit_right_catch]]: +; CHECK-NEXT: %[[implicit_right_pad:[^ ]+]] = catchpad within %[[implicit_right_cs]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_pad]]) ] +; CHECK-NEXT: unwind label %[[implicit_right_child:.+]] + +implicit.right.child: + %implicit.right.child.pad = cleanuppad within %implicit.right.pad [] + invoke void @g() [ "funclet"(token %implicit.right.child.pad) ] + to label %unreachable unwind label %implicit.right.grandchild +; CHECK: [[implicit_right_child]]: +; CHECK-NEXT: %[[implicit_right_child_pad:[^ ]+]] = cleanuppad within %[[implicit_right_pad]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_child_pad]]) ] +; CHECK-NEXT: unwind label %[[implicit_right_grandchild:.+]] + +implicit.right.grandchild: + %implicit.right.grandchild.cs = catchswitch within %implicit.right.child.pad [label %implicit.right.grandchild.catch] unwind to caller +; CHECK: [[implicit_right_grandchild]]: +; CHECK-NEXT: %[[implicit_right_grandchild_cs:[^ ]+]] = catchswitch within %[[implicit_right_child_pad]] [label %[[implicit_right_grandchild_catch:[^ ]+]]] unwind to caller + +implicit.right.grandchild.catch: + %implicit.right.grandhcild.pad = catchpad within %implicit.right.grandchild.cs [] + call void @g() [ "funclet"(token %implicit.right.grandhcild.pad) ] + br label %unreachable +; CHECK: [[implicit_right_grandchild_catch]]: +; CHECK-NEXT: %[[implicit_right_grandhcild_pad:[^ ]+]] = catchpad within %[[implicit_right_grandchild_cs]] +; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_right_grandhcild_pad]]) ] + +internal: + %internal.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %internal.pad) ] + cleanupret from %internal.pad unwind to caller +; CHECK: [[internal]]: +; internal is a cleanup with a "return to caller" cleanuppad; that needs to get redirected +; to %cleanup in the caller, and the call needs to get similarly rewritten to an invoke. +; CHECK-NEXT: %[[internal_pad:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %internal.pad.i) ] +; CHECK-NEXT: to label %[[next:[^ ]+]] unwind label %cleanup{{$}} +; CHECK: [[next]]: +; CHECK-NEXT: cleanupret from %[[internal_pad]] unwind label %cleanup{{$}} + +unreachable: + unreachable +exit: + ret void +} + + +declare void @ProcessCLRException() + +; Make sure the logic doesn't get tripped up when the inlined invoke is +; itself within a funclet in the caller. 
+; CHECK-LABEL: define void @test6( +define void @test6() personality void ()* @ProcessCLRException { +entry: + invoke void @g() + to label %exit unwind label %callsite_parent +callsite_parent: + %callsite_parent.pad = cleanuppad within none [] +; CHECK: %callsite_parent.pad = cleanuppad within none + invoke void @test6_inlinee() [ "funclet"(token %callsite_parent.pad) ] + to label %ret unwind label %cleanup +ret: + cleanupret from %callsite_parent.pad unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test6_inlinee() alwaysinline personality void ()* @ProcessCLRException { +entry: + invoke void @g() + to label %exit unwind label %inlinee_cleanup +; CHECK-NEXT: invoke void @g() [ "funclet"(token %callsite_parent.pad) ] +; CHECK-NEXT: unwind label %[[inlinee_cleanup:.+]] + +inlinee_cleanup: + %inlinee.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %inlinee.pad) ] + unreachable +; CHECK: [[inlinee_cleanup]]: +; CHECK-NEXT: %[[inlinee_pad:[^ ]+]] = cleanuppad within %callsite_parent.pad +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[inlinee_pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +exit: + ret void +} diff --git a/test/Transforms/InstCombine/bitreverse-hang.ll b/test/Transforms/InstCombine/bitreverse-hang.ll new file mode 100644 index 0000000..6823bd0 --- /dev/null +++ b/test/Transforms/InstCombine/bitreverse-hang.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-unroll -instcombine -S | FileCheck %s + +; This test is a worst-case scenario for bitreversal/byteswap detection. +; After loop unrolling (the unrolled loop is unreadably large so it has been kept +; rolled here), we have a binary tree of OR operands (as bitreversal detection +; looks straight through shifts): +; +; OR +; | \ +; | LSHR +; | / +; OR +; | \ +; | LSHR +; | / +; OR +; +; This results in exponential runtime. The loop here is 32 iterations which will +; totally hang if we don't deal with this case cleverly. 
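+;
+; (Editor's sketch, with hypothetical value names rather than the actual
+; unrolled output: two unrolled iterations produce
+;   %s1 = lshr i32 %v, 1
+;   %o1 = or i32 %s1, %v
+;   %s2 = lshr i32 %o1, 1
+;   %o2 = or i32 %s2, %o1
+; so a matcher that looks through each OR operand revisits the whole tree
+; under %o1 when it examines %o2, doubling the work per iteration.)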
+ +@b = common global i32 0, align 4 + +; CHECK: define i32 @fn1 +define i32 @fn1() #0 { +entry: + %b.promoted = load i32, i32* @b, align 4, !tbaa !2 + br label %for.body + +for.body: ; preds = %for.body, %entry + %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ] + %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %shr = lshr i32 %or4, 1 + %or = or i32 %shr, %or4 + %inc = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, 32 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + store i32 %or, i32* @b, align 4, !tbaa !2 + ret i32 undef +} + +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/InstCombine/bitreverse-recognize.ll b/test/Transforms/InstCombine/bitreverse-recognize.ll deleted file mode 100644 index fbd5cb6..0000000 --- a/test/Transforms/InstCombine/bitreverse-recognize.ll +++ /dev/null @@ -1,114 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define zeroext i8 @f_u8(i8 zeroext %a) { -; CHECK-LABEL: @f_u8 -; CHECK-NEXT: %[[A:.*]] = call i8 @llvm.bitreverse.i8(i8 %a) -; CHECK-NEXT: ret i8 %[[A]] - %1 = shl i8 %a, 7 - %2 = shl i8 %a, 5 - %3 = and i8 %2, 64 - %4 = shl i8 %a, 3 - %5 = and i8 %4, 32 - %6 = shl i8 %a, 1 - %7 = and i8 %6, 16 - %8 = lshr i8 %a, 1 - %9 = and i8 %8, 8 - %10 = lshr i8 %a, 3 - %11 = and i8 %10, 4 - %12 = lshr i8 %a, 5 - %13 = and i8 %12, 2 - %14 = lshr i8 %a, 7 - %15 = or i8 %14, %1 - %16 = or i8 %15, %3 - %17 = or i8 %16, %5 - %18 = or i8 %17, %7 - %19 = or i8 %18, %9 - %20 = or i8 %19, %11 - %21 = or i8 %20, %13 - ret i8 %21 -} - -; The ANDs with 32 and 64 have been swapped here, so the sequence does not -; completely match a bitreverse. 
-define zeroext i8 @f_u8_fail(i8 zeroext %a) { -; CHECK-LABEL: @f_u8_fail -; CHECK-NOT: call -; CHECK: ret i8 - %1 = shl i8 %a, 7 - %2 = shl i8 %a, 5 - %3 = and i8 %2, 32 - %4 = shl i8 %a, 3 - %5 = and i8 %4, 64 - %6 = shl i8 %a, 1 - %7 = and i8 %6, 16 - %8 = lshr i8 %a, 1 - %9 = and i8 %8, 8 - %10 = lshr i8 %a, 3 - %11 = and i8 %10, 4 - %12 = lshr i8 %a, 5 - %13 = and i8 %12, 2 - %14 = lshr i8 %a, 7 - %15 = or i8 %14, %1 - %16 = or i8 %15, %3 - %17 = or i8 %16, %5 - %18 = or i8 %17, %7 - %19 = or i8 %18, %9 - %20 = or i8 %19, %11 - %21 = or i8 %20, %13 - ret i8 %21 -} - -define zeroext i16 @f_u16(i16 zeroext %a) { -; CHECK-LABEL: @f_u16 -; CHECK-NEXT: %[[A:.*]] = call i16 @llvm.bitreverse.i16(i16 %a) -; CHECK-NEXT: ret i16 %[[A]] - %1 = shl i16 %a, 15 - %2 = shl i16 %a, 13 - %3 = and i16 %2, 16384 - %4 = shl i16 %a, 11 - %5 = and i16 %4, 8192 - %6 = shl i16 %a, 9 - %7 = and i16 %6, 4096 - %8 = shl i16 %a, 7 - %9 = and i16 %8, 2048 - %10 = shl i16 %a, 5 - %11 = and i16 %10, 1024 - %12 = shl i16 %a, 3 - %13 = and i16 %12, 512 - %14 = shl i16 %a, 1 - %15 = and i16 %14, 256 - %16 = lshr i16 %a, 1 - %17 = and i16 %16, 128 - %18 = lshr i16 %a, 3 - %19 = and i16 %18, 64 - %20 = lshr i16 %a, 5 - %21 = and i16 %20, 32 - %22 = lshr i16 %a, 7 - %23 = and i16 %22, 16 - %24 = lshr i16 %a, 9 - %25 = and i16 %24, 8 - %26 = lshr i16 %a, 11 - %27 = and i16 %26, 4 - %28 = lshr i16 %a, 13 - %29 = and i16 %28, 2 - %30 = lshr i16 %a, 15 - %31 = or i16 %30, %1 - %32 = or i16 %31, %3 - %33 = or i16 %32, %5 - %34 = or i16 %33, %7 - %35 = or i16 %34, %9 - %36 = or i16 %35, %11 - %37 = or i16 %36, %13 - %38 = or i16 %37, %15 - %39 = or i16 %38, %17 - %40 = or i16 %39, %19 - %41 = or i16 %40, %21 - %42 = or i16 %41, %23 - %43 = or i16 %42, %25 - %44 = or i16 %43, %27 - %45 = or i16 %44, %29 - ret i16 %45 -} \ No newline at end of file diff --git a/test/Transforms/InstCombine/cos-2.ll b/test/Transforms/InstCombine/cos-2.ll index c9a9c7c..a85cc8f 100644 --- a/test/Transforms/InstCombine/cos-2.ll +++ b/test/Transforms/InstCombine/cos-2.ll @@ -1,12 +1,11 @@ -; Test that the cos library call simplifier works correctly. -; ; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" declare float @cos(double) +declare signext i8 @sqrt(...) -; Check that cos functions with the wrong prototype aren't simplified. +; Check that functions with the wrong prototype aren't simplified. define float @test_no_simplify1(double %d) { ; CHECK-LABEL: @test_no_simplify1( @@ -15,3 +14,14 @@ define float @test_no_simplify1(double %d) { ; CHECK: call float @cos(double %neg) ret float %cos } + + +define i8 @bogus_sqrt() { + %fake_sqrt = call signext i8 (...) @sqrt() + ret i8 %fake_sqrt + +; CHECK-LABEL: bogus_sqrt( +; CHECK-NEXT: %fake_sqrt = call signext i8 (...) @sqrt() +; CHECK-NEXT: ret i8 %fake_sqrt +} + diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll index 319ea32..74f3ebb 100644 --- a/test/Transforms/InstCombine/double-float-shrink-1.ll +++ b/test/Transforms/InstCombine/double-float-shrink-1.ll @@ -364,6 +364,26 @@ define float @max1(float %a, float %b) { ; CHECK-NEXT: ret } +; A function can have a name that matches a common libcall, +; but with the wrong type(s). Let it be. 
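+; (Editor's note: the @fake_fmin below calls an @fmin declared on fp128,
+; while the libm fmin operates on double, so the simplifier must not
+; treat it as the real libcall.)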
+ +define float @fake_fmin(float %a, float %b) { + %c = fpext float %a to fp128 + %d = fpext float %b to fp128 + %e = call fp128 @fmin(fp128 %c, fp128 %d) + %f = fptrunc fp128 %e to float + ret float %f + +; CHECK-LABEL: fake_fmin( +; CHECK-NEXT: %c = fpext float %a to fp128 +; CHECK-NEXT: %d = fpext float %b to fp128 +; CHECK-NEXT: %e = call fp128 @fmin(fp128 %c, fp128 %d) +; CHECK-NEXT: %f = fptrunc fp128 %e to float +; CHECK-NEXT: ret float %f +} + +declare fp128 @fmin(fp128, fp128) ; This is not the 'fmin' you're looking for. + declare double @fmax(double, double) declare double @tanh(double) #1 diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp index 67e7cbd..a76ec11 100644 --- a/tools/lli/lli.cpp +++ b/tools/lli/lli.cpp @@ -16,6 +16,7 @@ #include "OrcLazyJIT.h" #include "RemoteJITUtils.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/CodeGen/LinkAllCodegenComponents.h" @@ -741,11 +742,11 @@ std::unique_ptr launchRemote() { ChildPath.reset(new char[ChildExecPath.size() + 1]); std::copy(ChildExecPath.begin(), ChildExecPath.end(), &ChildPath[0]); ChildPath[ChildExecPath.size()] = '\0'; - std::string ChildInStr = std::to_string(PipeFD[0][0]); + std::string ChildInStr = utostr(PipeFD[0][0]); ChildIn.reset(new char[ChildInStr.size() + 1]); std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]); ChildIn[ChildInStr.size()] = '\0'; - std::string ChildOutStr = std::to_string(PipeFD[1][1]); + std::string ChildOutStr = utostr(PipeFD[1][1]); ChildOut.reset(new char[ChildOutStr.size() + 1]); std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]); ChildOut[ChildOutStr.size()] = '\0'; diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index fb50160..c3884ba 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -65,11 +65,6 @@ function usage() { echo " -no-openmp Disable check-out & build libomp" } -if [ `uname -s` = "Darwin" ]; then - # compiler-rt doesn't yet build with CMake on Darwin. - use_autoconf="yes" -fi - while [ $# -gt 0 ]; do case $1 in -release | --release ) @@ -288,10 +283,20 @@ function export_sources() { if [ ! -h clang ]; then ln -s ../../cfe.src clang fi - cd $BuildDir/llvm.src/tools/clang/tools - if [ ! -h extra ]; then - ln -s ../../../../clang-tools-extra.src extra + + # The autoconf and CMake builds want different symlinks here: + if [ "$use_autoconf" = "yes" ]; then + cd $BuildDir/llvm.src/tools/clang/tools + if [ ! -h extra ]; then + ln -s ../../../../clang-tools-extra.src extra + fi + else + cd $BuildDir/cfe.src/tools + if [ ! -h extra ]; then + ln -s ../../clang-tools-extra.src extra + fi fi + cd $BuildDir/llvm.src/projects if [ -d $BuildDir/test-suite.src ] && [ ! 
-h test-suite ]; then
        ln -s ../../test-suite.src test-suite
--
cgit v1.1


From 44c4732640f764c943d7814138396141c0f4646b Mon Sep 17 00:00:00 2001
From: dim
Date: Wed, 27 Jan 2016 21:08:51 +0000
Subject: Vendor import of llvm release_38 branch r258968:

  https://llvm.org/svn/llvm-project/llvm/branches/release_38@258968
---
 include/llvm/Analysis/ScalarEvolution.h          |  6 +-
 lib/Analysis/DemandedBits.cpp                    |  2 +-
 lib/Analysis/ScalarEvolution.cpp                 |  8 +++
 lib/Target/AMDGPU/AMDGPU.td                      |  5 ++
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp            |  2 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h              |  5 ++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp        |  2 +
 lib/Target/X86/X86ISelLowering.cpp               |  3 +-
 lib/Transforms/Instrumentation/GCOVProfiling.cpp | 10 ++++
 test/Analysis/DemandedBits/basic.ll              | 13 ++++-
 test/CodeGen/X86/cmovcmov.ll                     | 49 ++++++++++++++++
 test/Transforms/GCOVProfiling/modules.ll         | 12 ++++
 test/Transforms/IndVarSimplify/pr26207.ll        | 20 +++++++
 utils/release/test-release.sh                    | 71 +++++++++---------------
 14 files changed, 157 insertions(+), 51 deletions(-)
 create mode 100644 test/Transforms/GCOVProfiling/modules.ll
 create mode 100644 test/Transforms/IndVarSimplify/pr26207.ll

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index c08335d..ef93057 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -412,7 +412,11 @@ namespace llvm {
       /*implicit*/ ExitLimit(const SCEV *E) : Exact(E), Max(E) {}
 
-      ExitLimit(const SCEV *E, const SCEV *M) : Exact(E), Max(M) {}
+      ExitLimit(const SCEV *E, const SCEV *M) : Exact(E), Max(M) {
+        assert((isa<SCEVCouldNotCompute>(Exact) ||
+                !isa<SCEVCouldNotCompute>(Max)) &&
+               "Exact is not allowed to be less precise than Max");
+      }
 
       /// Test whether this ExitLimit contains any computed information, or
       /// whether it's all SCEVCouldNotCompute values.
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 912c5ce..143d0b7 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -244,7 +244,7 @@ void DemandedBits::determineLiveOperandBits(
       break;
     case Instruction::ICmp:
       // Count the number of leading zeroes in each operand.
-      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1));
       auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(),
                                        KnownZero2.countLeadingOnes());
       AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes);
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 34074ef..ef1bb3a 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -5368,6 +5368,14 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
           BECount = EL0.Exact;
       }
 
+      // There are cases (e.g. PR26207) where computeExitLimitFromCond is able
+      // to be more aggressive when computing BECount than when computing
+      // MaxBECount.  In these cases it is possible for EL0.Exact and EL1.Exact
+      // to match, but for EL0.Max and EL1.Max to not.
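+      // (Editor's note, restating the guard that follows: when the combined
+      // exact count is computable but the combined max is not, it is sound
+      // and strictly more precise to reuse the exact count as the max.)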
+      if (isa<SCEVCouldNotCompute>(MaxBECount) &&
+          !isa<SCEVCouldNotCompute>(BECount))
+        MaxBECount = BECount;
+
       return ExitLimit(BECount, MaxBECount);
     }
     if (BO->getOpcode() == Instruction::Or) {
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index db869cf..79c6604 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -138,6 +138,11 @@ def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
                                                       "true",
                                                       "Enable scratch buffer sizes greater than 128 GB">;
 
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+        "EnableSIScheduler",
+        "true",
+        "Enable SI Machine Scheduler">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
                           "TexVTXClauseSize",
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c6af5b9..7d70fa7 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -78,7 +78,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
       IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
-      FrameLowering(nullptr),
+      EnableSIScheduler(false), FrameLowering(nullptr),
       InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
 
   initializeSubtargetDependencies(TT, GPU, FS);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d371227..4796e9e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -90,6 +90,7 @@ private:
   int LDSBankCount;
   unsigned IsaVersion;
   bool EnableHugeScratchBuffer;
+  bool EnableSIScheduler;
 
   std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -280,6 +281,10 @@ public:
     return EnableHugeScratchBuffer;
   }
 
+  bool enableSIScheduler() const {
+    return EnableSIScheduler;
+  }
+
   bool dumpCode() const {
     return DumpCode;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b1be619..519ae5c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -147,6 +147,8 @@ public:
     const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       return createR600MachineScheduler(C);
+    else if (ST.enableSIScheduler())
+      return createSIMachineScheduler(C);
     return nullptr;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6904714..34f3919 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21880,7 +21880,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   if (LastCMOV == MI && NextMIIt != BB->end() &&
       NextMIIt->getOpcode() == MI->getOpcode() &&
       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() &&
+      NextMIIt->getOperand(1).isKill()) {
     CascadedCMOV = &*NextMIIt;
   }
 
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index fa939ae..ffde7f8 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -494,6 +494,11 @@ void GCOVProfiler::emitProfileNotes() {
       // LTO, we'll generate the same .gcno files.
 
       auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+      // Skip module skeleton (and module) CUs.
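+      // (Editor's note: a non-zero DWOId marks a skeleton unit for split
+      // DWARF, which carries no code of its own to instrument.)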
+      if (CU->getDWOId())
+        continue;
+
       std::error_code EC;
       raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None);
       std::string EdgeDestinations;
@@ -853,6 +858,11 @@ Function *GCOVProfiler::insertCounterWriteout(
   if (CU_Nodes) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
       auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+      // Skip module skeleton (and module) CUs.
+      if (CU->getDWOId())
+        continue;
+
       std::string FilenameGcda = mangleName(CU, "gcda");
       uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
       Builder.CreateCall(StartFile,
diff --git a/test/Analysis/DemandedBits/basic.ll b/test/Analysis/DemandedBits/basic.ll
index 487e522..9973edf 100644
--- a/test/Analysis/DemandedBits/basic.ll
+++ b/test/Analysis/DemandedBits/basic.ll
@@ -24,11 +24,20 @@ define i1 @test_icmp1(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: 'test_icmp2'
 ; CHECK-DAG: DemandedBits: 0x1 for   %3 = icmp eq i32 %1, %2
-; CHECK-DAG: DemandedBits: 0xFF for   %1 = and i32 %a, 255
-; CHECK-DAG: DemandedBits: 0xF for   %2 = ashr i32 %1, 4
+; CHECK-DAG: DemandedBits: 0xFFF for   %1 = and i32 %a, 255
+; CHECK-DAG: DemandedBits: 0xFF for   %2 = ashr i32 %1, 4
 define i1 @test_icmp2(i32 %a, i32 %b) {
   %1 = and i32 %a, 255
   %2 = ashr i32 %1, 4
   %3 = icmp eq i32 %1, %2
   ret i1 %3
 }
+
+; CHECK-LABEL: 'test_icmp3'
+; CHECK-DAG: DemandedBits: 0xFFFFFFFF for   %1 = and i32 %a, 255
+; CHECK-DAG: DemandedBits: 0x1 for   %2 = icmp eq i32 -1, %1
+define i1 @test_icmp3(i32 %a) {
+  %1 = and i32 %a, 255
+  %2 = icmp eq i32 -1, %1
+  ret i1 %2
+}
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index d3d9748..9363d31 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -224,3 +224,52 @@ entry:
 }
 
 attributes #0 = { nounwind }
+
+@g8 = global i8 0
+
+; The following test failed because llvm had a bug where a structure like:
+;
+; %vreg12 = CMOV_GR8 %vreg7, %vreg11 ... (lt)
+; %vreg13 = CMOV_GR8 %vreg12, %vreg11 ...
(gt) +; +; was lowered to: +; +; The first two cmovs got expanded to: +; BB#0: +; JL_1 BB#9 +; BB#7: +; JG_1 BB#9 +; BB#8: +; BB#9: +; vreg12 = phi(vreg7, BB#8, vreg11, BB#0, vreg12, BB#7) +; vreg13 = COPY vreg12 +; Which was invalid as %vreg12 is not the same value as %vreg13 + +; CHECK-LABEL: no_cascade_opt: +; CMOV-DAG: cmpl %edx, %esi +; CMOV-DAG: movb $20, %al +; CMOV-DAG: movb $20, %dl +; CMOV: jl [[BB0:.LBB[0-9_]+]] +; CMOV: movb %cl, %dl +; CMOV: [[BB0]]: +; CMOV: jg [[BB1:.LBB[0-9_]+]] +; CMOV: movb %dl, %al +; CMOV: [[BB1]]: +; CMOV: testl %edi, %edi +; CMOV: je [[BB2:.LBB[0-9_]+]] +; CMOV: movb %dl, %al +; CMOV: [[BB2]]: +; CMOV: movb %al, g8(%rip) +; CMOV: retq +define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) { +entry: + %c0 = icmp eq i32 %v0, 0 + %c1 = icmp slt i32 %v1, %v2 + %c2 = icmp sgt i32 %v1, %v2 + %trunc = trunc i32 %v3 to i8 + %sel0 = select i1 %c1, i8 20, i8 %trunc + %sel1 = select i1 %c2, i8 20, i8 %sel0 + %sel2 = select i1 %c0, i8 %sel1, i8 %sel0 + store volatile i8 %sel2, i8* @g8 + ret void +} diff --git a/test/Transforms/GCOVProfiling/modules.ll b/test/Transforms/GCOVProfiling/modules.ll new file mode 100644 index 0000000..1a8edfe --- /dev/null +++ b/test/Transforms/GCOVProfiling/modules.ll @@ -0,0 +1,12 @@ +; RUN: opt -insert-gcov-profiling -o - < %s | llvm-dis | FileCheck -check-prefix=EMIT-ARCS %s + +; EMIT-ARCS-NOT: call void @llvm_gcda_start_file + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !2, globals: !2, imports: !2, dwoId: 43981) +!1 = !DIFile(filename: "", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/Transforms/IndVarSimplify/pr26207.ll b/test/Transforms/IndVarSimplify/pr26207.ll new file mode 100644 index 0000000..9d351e0 --- /dev/null +++ b/test/Transforms/IndVarSimplify/pr26207.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define void @main(i16 %in) { +; CHECK-LABEL: @main( + br label %bb2 + +bb2: ; preds = %bb1.i, %bb2, %0 + %_tmp44.i = icmp slt i16 %in, 2 + br i1 %_tmp44.i, label %bb1.i, label %bb2 + +bb1.i: ; preds = %bb1.i, %bb2 + %_tmp25.i = phi i16 [ %in, %bb2 ], [ %_tmp6.i, %bb1.i ] + %_tmp6.i = add nsw i16 %_tmp25.i, 1 + %_tmp10.i = icmp sge i16 %_tmp6.i, 2 + %exitcond.i = icmp eq i16 %_tmp6.i, 2 + %or.cond = and i1 %_tmp10.i, %exitcond.i + br i1 %or.cond, label %bb2, label %bb1.i +} diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index c3884ba..c5fe631 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -267,56 +267,36 @@ function export_sources() { check_valid_urls for proj in $projects ; do - if [ -d $proj.src ]; then - echo "# Reusing $proj $Release-$RC sources" + case $proj in + llvm) + projsrc=$proj.src + ;; + cfe) + projsrc=llvm.src/tools/clang + ;; + clang-tools-extra) + projsrc=llvm.src/tools/clang/tools/extra + ;; + compiler-rt|libcxx|libcxxabi|libunwind|openmp|test-suite) + projsrc=llvm.src/projects/$proj + ;; + *) + echo "error: unknown project $proj" + exit 1 + ;; + esac + + if [ -d $projsrc ]; then + echo "# Reusing $proj $Release-$RC sources in $projsrc" continue fi - echo "# Exporting $proj $Release-$RC sources" - if ! 
svn export -q $Base_url/$proj/$ExportBranch $proj.src ; then + echo "# Exporting $proj $Release-$RC sources to $projsrc" + if ! svn export -q $Base_url/$proj/$ExportBranch $projsrc ; then echo "error: failed to export $proj project" exit 1 fi done - echo "# Creating symlinks" - cd $BuildDir/llvm.src/tools - if [ ! -h clang ]; then - ln -s ../../cfe.src clang - fi - - # The autoconf and CMake builds want different symlinks here: - if [ "$use_autoconf" = "yes" ]; then - cd $BuildDir/llvm.src/tools/clang/tools - if [ ! -h extra ]; then - ln -s ../../../../clang-tools-extra.src extra - fi - else - cd $BuildDir/cfe.src/tools - if [ ! -h extra ]; then - ln -s ../../clang-tools-extra.src extra - fi - fi - - cd $BuildDir/llvm.src/projects - if [ -d $BuildDir/test-suite.src ] && [ ! -h test-suite ]; then - ln -s ../../test-suite.src test-suite - fi - if [ -d $BuildDir/compiler-rt.src ] && [ ! -h compiler-rt ]; then - ln -s ../../compiler-rt.src compiler-rt - fi - if [ -d $BuildDir/openmp.src ] && [ ! -h openmp ]; then - ln -s ../../openmp.src openmp - fi - if [ -d $BuildDir/libcxx.src ] && [ ! -h libcxx ]; then - ln -s ../../libcxx.src libcxx - fi - if [ -d $BuildDir/libcxxabi.src ] && [ ! -h libcxxabi ]; then - ln -s ../../libcxxabi.src libcxxabi - fi - if [ -d $BuildDir/libunwind.src ] && [ ! -h libunwind ]; then - ln -s ../../libunwind.src libunwind - fi - cd $BuildDir } @@ -560,8 +540,9 @@ for Flavor in $Flavors ; do # Substitute 'Phase2' for 'Phase3' in the Phase 2 object file in # case there are build paths in the debug info. On some systems, # sed adds a newline to the output, so pass $p3 through sed too. - if ! cmp -s <(sed -e 's,Phase2,Phase3,g' $p2) <(sed -e '' $p3) \ - 16 16 ; then + if ! cmp -s \ + <(env LC_CTYPE=C sed -e 's,Phase2,Phase3,g' $p2) \ + <(env LC_CTYPE=C sed -e '' $p3) 16 16; then echo "file `basename $p2` differs between phase 2 and phase 3" fi done -- cgit v1.1 From 97a7b8a20a989eb4cf3d9465e1451de6cd05fa41 Mon Sep 17 00:00:00 2001 From: dim Date: Sat, 13 Feb 2016 14:57:10 +0000 Subject: Vendor import of llvm release_38 branch r260756: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260756 --- cmake/modules/AddLLVM.cmake | 36 +-- cmake/modules/LLVM-Config.cmake | 11 +- docs/ReleaseNotes.rst | 148 ++++++++- include/llvm/IR/IntrinsicsPowerPC.td | 2 +- include/llvm/IR/Value.h | 4 - lib/Analysis/DemandedBits.cpp | 7 - lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 5 + lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 55 +++- lib/IR/Value.cpp | 4 +- lib/Target/AArch64/AArch64.td | 4 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 3 + lib/Target/AArch64/AArch64SchedM1.td | 359 +++++++++++++++++++++ lib/Target/AMDGPU/AMDGPU.td | 3 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- lib/Target/AMDGPU/Processors.td | 12 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 3 + lib/Target/ARM/ARMISelDAGToDAG.cpp | 4 +- lib/Target/PowerPC/PPCFastISel.cpp | 18 +- lib/Target/PowerPC/PPCInstrAltivec.td | 2 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 65 ++-- lib/Transforms/InstCombine/InstCombineCompares.cpp | 2 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 3 +- .../InstCombine/InstCombineVectorOps.cpp | 18 +- lib/Transforms/Utils/SimplifyCFG.cpp | 12 + test/Analysis/DemandedBits/basic.ll | 31 -- test/CodeGen/AArch64/fp16-v4-instructions.ll | 274 ++++++++++++++++ test/CodeGen/AArch64/fp16-v8-instructions.ll | 84 +++++ test/CodeGen/AMDGPU/hsa-note-no-func.ll | 2 + 
test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 1 + test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 33 ++ test/CodeGen/ARM/shifter_operand.ll | 17 + test/CodeGen/PowerPC/fast-isel-ret.ll | 9 + test/CodeGen/PowerPC/inline-asm-s-modifier.ll | 10 + test/CodeGen/PowerPC/pr26193.ll | 9 + test/CodeGen/PowerPC/pr26356.ll | 136 ++++++++ test/CodeGen/PowerPC/pr26381.ll | 8 + test/CodeGen/SystemZ/int-cmp-53.ll | 26 ++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 63 +++- test/CodeGen/X86/setcc-lowering.ll | 79 ++++- test/DebugInfo/X86/PR26148.ll | 102 ++++++ test/Transforms/InstCombine/icmp.ll | 12 + .../InstCombine/insert-extract-shuffle.ll | 30 ++ test/Transforms/InstCombine/unpack-fca.ll | 15 + .../AArch64/loop-vectorization-factors.ll | 34 -- .../SimplifyCFG/X86/switch_to_lookup_table.ll | 32 ++ tools/CMakeLists.txt | 2 +- utils/release/test-release.sh | 26 +- utils/unittest/CMakeLists.txt | 7 +- utils/unittest/UnitTestMain/CMakeLists.txt | 4 +- 51 files changed, 1611 insertions(+), 229 deletions(-) create mode 100644 lib/Target/AArch64/AArch64SchedM1.td create mode 100644 test/CodeGen/AMDGPU/spill-scavenge-offset.ll create mode 100644 test/CodeGen/PowerPC/inline-asm-s-modifier.ll create mode 100644 test/CodeGen/PowerPC/pr26193.ll create mode 100644 test/CodeGen/PowerPC/pr26356.ll create mode 100644 test/CodeGen/PowerPC/pr26381.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-53.ll create mode 100644 test/DebugInfo/X86/PR26148.ll diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index b06e434..a829751 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -468,20 +468,23 @@ function(llvm_add_library name) endif() endif() - # Add the explicit dependency information for this library. - # - # It would be nice to verify that we have the dependencies for this library - # name, but using get_property(... SET) doesn't suffice to determine if a - # property has been set to an empty value. - get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) - - if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_STATIC AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) + if (DEFINED LLVM_LINK_COMPONENTS OR DEFINED ARG_LINK_COMPONENTS) + if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) + set(llvm_libs LLVM) + else() + llvm_map_components_to_libnames(llvm_libs + ${ARG_LINK_COMPONENTS} + ${LLVM_LINK_COMPONENTS} + ) + endif() else() - llvm_map_components_to_libnames(llvm_libs - ${ARG_LINK_COMPONENTS} - ${LLVM_LINK_COMPONENTS} - ) + # Components have not been defined explicitly in CMake, so add the + # dependency information for this library as defined by LLVMBuild. + # + # It would be nice to verify that we have the dependencies for this library + # name, but using get_property(... SET) doesn't suffice to determine if a + # property has been set to an empty value. + get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) endif() if(CMAKE_VERSION VERSION_LESS 2.8.12) @@ -882,14 +885,11 @@ function(add_unittest test_suite test_name) set(LLVM_REQUIRES_RTTI OFF) + list(APPEND LLVM_LINK_COMPONENTS Support) # gtest needs it for raw_ostream add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN}) set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}) set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir}) - target_link_libraries(${test_name} - gtest - gtest_main - LLVMSupport # gtest needs it for raw_ostream. 
-    )
+  target_link_libraries(${test_name} gtest_main gtest)
 
   add_dependencies(${test_suite} ${test_name})
   get_target_property(test_suite_folder ${test_suite} FOLDER)
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index aa68b40..725178a 100755
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -40,10 +40,19 @@ macro(llvm_config executable)
     # done in case libLLVM does not contain all of the components
     # the target requires.
     #
-    # TODO strip LLVM_DYLIB_COMPONENTS out of link_components.
+    # Strip LLVM_DYLIB_COMPONENTS out of link_components.
     # To do this, we need special handling for "all", since that
     # may imply linking to libraries that are not included in
     # libLLVM.
+
+    if (DEFINED link_components AND DEFINED LLVM_DYLIB_COMPONENTS)
+      if("${LLVM_DYLIB_COMPONENTS}" STREQUAL "all")
+        set(link_components "")
+      else()
+        list(REMOVE_ITEM link_components ${LLVM_DYLIB_COMPONENTS})
+      endif()
+    endif()
+
     target_link_libraries(${executable} LLVM)
   endif()
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dccb7f4..7b284d5 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -5,11 +5,6 @@ LLVM 3.8 Release Notes
 .. contents::
     :local:
 
-.. warning::
-   These are in-progress notes for the upcoming LLVM 3.8 release.  You may
-   prefer the `LLVM 3.7 Release Notes <http://llvm.org/releases/3.7.0/docs/ReleaseNotes.html>`_.
-
 Introduction
 ============
 
@@ -26,11 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
 <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
 them.
 
-Note that if you are reading this file from a Subversion checkout or the main
-LLVM web page, this document applies to the *next* release, not the current
-one.  To see the release notes for a specific release, please see the `releases
-page <http://llvm.org/releases/>`_.
-
 Non-comprehensive list of changes in this release
 =================================================
 * With this release, the minimum Windows version required for running LLVM is
@@ -79,6 +69,26 @@ Non-comprehensive list of changes in this release
 
 * Support for dematerializing has been dropped.
 
+* RegisterScheduler::setDefault was removed. Targets that used to call into the
+  command line parser to set the DAGScheduler, and that don't have enough
+  control with setSchedulingPreference, should look into overriding the
+  SubTargetHook "getDAGScheduler()".
+
+* ``ilist_iterator<T>`` no longer has implicit conversions to and from ``T*``,
+  since ``ilist_iterator<T>`` may be pointing at the sentinel (which is usually
+  not of type ``T`` at all).  To convert from an iterator ``I`` to a pointer,
+  use ``&*I``; to convert from a pointer ``P`` to an iterator, use
+  ``P->getIterator()``.  Alternatively, explicit conversions via
+  ``static_cast<T>(U)`` are still available.
+
+* ``ilist_node<T>::getNextNode()`` and ``ilist_node<T>::getPrevNode()`` now
+  fail at compile time when the node cannot access its parent list.
+  Previously, when the sentinel was an ``ilist_half_node<T>``, this API
+  could return the sentinel instead of ``nullptr``.  Frustrated callers should
+  be updated to use ``iplist<T>::getNextNode(T*)`` instead.  Alternatively, if
+  the node ``N`` is guaranteed not to be the last in the list, it is safe to
+  call ``&*++N->getIterator()`` directly.
+
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
@@ -98,17 +108,97 @@ Non-comprehensive list of changes in this release
    Makes programs 10x faster by doing Special New Thing.
 
-Changes to the ARM Backend
---------------------------
-
-During this release ...
+Changes to the ARM Backends
+---------------------------
+
+During this release the AArch64 target has:
+
+* Added support for more sanitizers (MSAN, TSAN) and made them compatible with
+  all VMA kernel configurations (currently tested on 39 and 42 bits).
+* Gained initial LLD support in the new ELF back-end
+* Extended the Load/Store optimiser and cleaned up some of the bad decisions
+  made earlier.
+* Expanded LLDB support, including watchpoints, native building, Renderscript,
+  LLDB-server, debugging 32-bit applications.
+* Added support for the ``Exynos M1`` chip.
+
+During this release the ARM target has:
+
+* Gained massive performance improvements on embedded benchmarks due to finally
+  running the stride vectorizer in full form, incrementing the performance gains
+  that we already had in the previous releases with limited stride vectorization.
+* Expanded LLDB support, including watchpoints, unwind tables
+* Extended the Load/Store optimiser and cleaned up some of the bad decisions
+  made earlier.
+* Simplified code generation for global variable addresses in ELF, resulting in
+  a significant (4% in Chromium) reduction in code size.
+* Gained some additional code size improvements, though there's still a long road
+  ahead, especially for older cores.
+* Added some EABI floating point comparison functions to Compiler-RT
+* Added support for Windows+GNU triple, +features in -mcpu/-march options.
 
 Changes to the MIPS Target
 --------------------------
 
-During this release ...
-
+During this release the MIPS target has:
+
+* Significantly extended support for the Integrated Assembler. See below for
+  more information.
+* Added support for the ``P5600`` processor.
+* Added support for the ``interrupt`` attribute for MIPS32R2 and later. This
+  attribute will generate a function which can be used as an interrupt handler
+  on bare metal MIPS targets using the static relocation model.
+* Added support for the ``ERETNC`` instruction found in MIPS32R5 and later.
+* Added support for OpenCL. See http://portablecl.org/.
+
+  * Address spaces 1 to 255 are now reserved for software use and conversions
+    between them are no-op casts.
+
+* Removed the ``mips16`` value for the -mcpu option since it is an :abbr:`ASE
+  (Application Specific Extension)` and not a processor. If you were using this,
+  please specify another CPU and use ``-mips16`` to enable MIPS16.
+* Removed ``copy_u.w`` from 32-bit MSA and ``copy_u.d`` from 64-bit MSA since
+  they have been removed from the MSA specification due to forward compatibility
+  issues.  For example, 32-bit MSA code containing ``copy_u.w`` would behave
+  differently on a 64-bit processor supporting MSA. The corresponding intrinsics
+  are still available and may expand to ``copy_s.[wd]`` where this is
+  appropriate for forward compatibility purposes.
+* Relaxed the ``-mnan`` option to allow ``-mnan=2008`` on MIPS32R2/MIPS64R2 for
+  compatibility with GCC.
+* Made MIPS64R6 the default CPU for 64-bit Android triples.
+
+The MIPS target has also fixed various bugs including the following notable
+fixes:
+
+* Fixed reversed operands on ``mthi``/``mtlo`` in the DSP :abbr:`ASE
+  (Application Specific Extension)`.
+* The code generator no longer uses ``jal`` for calls to absolute immediate
+  addresses.
+* Disabled fast instruction selection on MIPS32R6 and MIPS64R6 since this is not
+  yet supported.
+* Corrected addend for ``R_MIPS_HI16`` and ``R_MIPS_PCHI16`` in MCJIT
+* The code generator no longer crashes when handling subregisters of a 64-bit
+  FPU register with undefined value.
+* The code generator no longer attempts to use ``$zero`` for operands that do
+  not permit ``$zero``.
+* Corrected the opcode used for ``ll``/``sc`` when using MIPS32R6/MIPS64R6 and
+  the Integrated Assembler.
+* Added support for atomic load and atomic store.
+* Corrected debug info when dynamically re-aligning the stack.
+
+Integrated Assembler
+^^^^^^^^^^^^^^^^^^^^
+
+We have made a large number of improvements to the integrated assembler for
+MIPS. In this release, the integrated assembler isn't quite production-ready
+since there are a few known issues related to bare-metal support, checking
+immediates on instructions, and the N32/N64 ABIs. However, the current support
+should be sufficient for many users of the O32 ABI, particularly those targeting
+MIPS32 on Linux or bare-metal MIPS32.
+
+If you would like to try the integrated assembler, please use
+``-fintegrated-as``.
 
 Changes to the PowerPC Target
 -----------------------------
@@ -123,6 +213,20 @@ Changes to the X86 Target
 
 * TLS is enabled for Cygwin as emutls.
 
+* Smaller code for materializing 32-bit 1 and -1 constants at ``-Os``.
+
+* More efficient code for wide integer compares. (E.g. 64-bit compares
+  on 32-bit targets.)
+
+* Tail call support for ``thiscall``, ``stdcall``, ``vectorcall``, and
+  ``fastcall`` functions.
+
+Changes to the AVR Target
+-------------------------
+
+Slightly less than half of the AVR backend has been merged in at this point. It is still
+missing a number of large parts which cause it to be unusable, but is well on the
+road to being completely merged and workable.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -140,7 +244,19 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
 a lot of other language and tools projects. This section lists some of the
 projects that have already been updated to work with LLVM 3.8.
 
-* A project
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X and Windows and also PowerPC (32/64 bit)
+and ARM. Ports to other architectures like AArch64 and MIPS64 are underway.
 
 Additional Information
diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 06dfc32..5512b10 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">, - Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty], + Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index bb7ff27..8918dcd 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -280,11 +280,7 @@ public: // when using them since you might not get all uses. // The methods that don't start with materialized_ assert that modules is // fully materialized. -#ifdef NDEBUG - void assertModuleIsMaterialized() const {} -#else void assertModuleIsMaterialized() const; -#endif bool use_empty() const { assertModuleIsMaterialized(); diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 143d0b7..6f92ba6 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandBits( if (OperandNo != 0) AB = AOut; break; - case Instruction::ICmp: - // Count the number of leading zeroes in each operand. - ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1)); - auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes); - break; } } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 4171657..5633aa4 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return true; O << -MO.getImm(); return false; + case 's': // The GCC deprecated s modifier + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << ((32 - MO.getImm()) & 31); + return false; } } return true; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ae62b6b..f56c8e4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) { llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!"); } -/// Determine whether two variable pieces overlap. -static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) { - if (!P1->isBitPiece() || !P2->isBitPiece()) - return true; +// Determine the relative position of the pieces described by P1 and P2. +// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap, +// 1 if P1 is entirely after P2. +static int pieceCmp(const DIExpression *P1, const DIExpression *P2) { unsigned l1 = P1->getBitPieceOffset(); unsigned l2 = P2->getBitPieceOffset(); unsigned r1 = l1 + P1->getBitPieceSize(); unsigned r2 = l2 + P2->getBitPieceSize(); - // True where [l1,r1[ and [r1,r2[ overlap. - return (l1 < r2) && (l2 < r1); + if (r1 <= l2) + return -1; + else if (r2 <= l1) + return 1; + else + return 0; +} + +/// Determine whether two variable pieces overlap. 
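+/// (Editor's note: operands that are not bit pieces are conservatively
+/// reported as overlapping everything, per the early return below.)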
+static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
+  if (!P1->isBitPiece() || !P2->isBitPiece())
+    return true;
+  return pieceCmp(P1, P2) == 0;
 }
 
 /// \brief If this and Next are describing different pieces of the same
@@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
 /// Return true if the merge was successful.
 bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
   if (Begin == Next.Begin) {
-    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
-    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
-    if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
-        !piecesOverlap(Expr, NextExpr)) {
-      addValues(Next.Values);
-      End = Next.End;
-      return true;
+    auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
+    auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
+    if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
+      return false;
+
+    // We can only merge entries if none of the pieces overlap any others.
+    // In doing so, we can take advantage of the fact that both lists are
+    // sorted.
+    for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
+      for (; j < Next.Values.size(); ++j) {
+        int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
+                           cast<DIExpression>(Next.Values[j].Expression));
+        if (res == 0) // The two expressions overlap, we can't merge.
+          return false;
+        // Values[i] is entirely before Next.Values[j],
+        // so go back to the next entry of Values.
+        else if (res == -1)
+          break;
+        // Next.Values[j] is entirely before Values[i], so go on to the
+        // next entry of Next.Values.
+      }
     }
+
+    addValues(Next.Values);
+    End = Next.End;
+    return true;
   }
   return false;
 }
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index eb9deb6..4d224a0 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -313,8 +313,8 @@ void Value::takeName(Value *V) {
     ST->reinsertValue(this);
 }
 
-#ifndef NDEBUG
 void Value::assertModuleIsMaterialized() const {
+#ifndef NDEBUG
   const GlobalValue *GV = dyn_cast<GlobalValue>(this);
   if (!GV)
     return;
@@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized() const {
   if (!M)
     return;
   assert(M->isMaterialized());
+#endif
 }
 
+#ifndef NDEBUG
 static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
                      Constant *C) {
   if (!Cache.insert(Expr).second)
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 46ef2c1..cd3e84d 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo;
 include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
+include "AArch64SchedM1.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                    "Cortex-A35 ARM processors", [
@@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
 // FIXME: Cortex-A72 is currently modelled as a Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
-// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
-def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; +def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 9b73c5e..92cf1cd 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } + if (LHS.getValueType().getVectorElementType() == MVT::f16) + return SDValue(); + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td new file mode 100644 index 0000000..6525628 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -0,0 +1,359 @@ +//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Samsung Exynos-M1 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M1 is a traditional superscalar microprocessor with a +// 4-wide in-order stage for decode and dispatch and a wider issue stage. +// The execution units and loads and stores are out-of-order. + +def ExynosM1Model : SchedMachineModel { + let IssueWidth = 4; // Up to 4 uops per cycle. + let MinLatency = 0; // OoO. + let MicroOpBufferSize = 96; // ROB size. + let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoadLatency = 4; // Optimistic load cases. + let MispredictPenalty = 14; // Minimum branch misprediction penalty. + let CompleteModel = 0; // Use the default model otherwise. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on the Exynos-M1, +// which has 9 pipelines, each with its own queue with out-of-order dispatch. + +def M1UnitA : ProcResource<2>; // Simple integer +def M1UnitC : ProcResource<1>; // Simple and complex integer +def M1UnitB : ProcResource<2>; // Branch +def M1UnitL : ProcResource<1>; // Load +def M1UnitS : ProcResource<1>; // Store +def M1PipeF0 : ProcResource<1>; // FP #0 +def M1PipeF1 : ProcResource<1>; // FP #1 + +let Super = M1PipeF0 in { + def M1UnitFMAC : ProcResource<1>; // FP multiplication + def M1UnitFCVT : ProcResource<1>; // FP conversion + def M1UnitNAL0 : ProcResource<1>; // Simple vector. + def M1UnitNMISC : ProcResource<1>; // Miscellanea + def M1UnitNCRYPT : ProcResource<1>; // Cryptographic +} + +let Super = M1PipeF1 in { + def M1UnitFADD : ProcResource<1>; // Simple FP + let BufferSize = 1 in + def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized) + def M1UnitNAL1 : ProcResource<1>; // Simple vector. + def M1UnitFST : ProcResource<1>; // FP store +} + +let SchedModel = ExynosM1Model in { + def M1UnitALU : ProcResGroup<[M1UnitA, + M1UnitC]>; // All simple integer. 
+ def M1UnitNALU : ProcResGroup<[M1UnitNAL0, + M1UnitNAL1]>; // All simple vector. +} + +let SchedModel = ExynosM1Model in { + +//===----------------------------------------------------------------------===// +// Coarse scheduling model for the Exynos-M1. + +// Branch instructions. +// TODO: Non-conditional direct branches take zero cycles and units. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +// TODO: Branch and link is much different. + +// Arithmetic and logical integer instructions. +def : WriteRes { let Latency = 1; } +// TODO: Shift over 3 and some extensions take 2 cycles. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +// Move instructions. +def : WriteRes { let Latency = 1; } + +// Divide and multiply instructions. +// TODO: Division blocks the divider inside C. +def : WriteRes { let Latency = 13; } +def : WriteRes { let Latency = 21; } +// TODO: Long multiplication take 5 cycles and also the ALU. +// TODO: Multiplication with accumulation can be advanced. +def : WriteRes { let Latency = 3; } +// TODO: 64-bit multiplication has a throughput of 1/2. +def : WriteRes { let Latency = 4; } + +// Miscellaneous instructions. +def : WriteRes { let Latency = 2; } + +// TODO: The latency for the post or pre register is 1 cycle. +def : WriteRes { let Latency = 0; } + +// Load instructions. +def : WriteRes { let Latency = 4; } +// TODO: Extended address requires also the ALU. +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 4; } + +// Store instructions. +def : WriteRes { let Latency = 1; } +// TODO: Extended address requires also the ALU. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +// FP data instructions. +def : WriteRes { let Latency = 3; } +// TODO: FCCMP is much different. +def : WriteRes { let Latency = 4; } +// TODO: DP takes longer. +def : WriteRes { let Latency = 15; } +// TODO: MACC takes longer. +def : WriteRes { let Latency = 4; } + +// FP miscellaneous instructions. +// TODO: Conversion between register files is much different. +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 1; } +// TODO: Copy from FPR to GPR is much different. +def : WriteRes { let Latency = 4; } + +// FP load instructions. +// TODO: ASIMD loads are much different. +def : WriteRes { let Latency = 5; } + +// FP store instructions. +// TODO: ASIMD stores are much different. +def : WriteRes { let Latency = 1; } + +// ASIMD FP instructions. +// TODO: Other operations are much different. +def : WriteRes { let Latency = 3; } + +// Other miscellaneous instructions. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Fast forwarding. + +// TODO: Add FP register forwarding rules. + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +// Integer multiply-accumulate. +// TODO: The forwarding for WriteIM64 saves actually 3 cycles. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Finer scheduling model for the Exynos-M1. 
+ +def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, + M1UnitNALU, + M1UnitFADD]> { let Latency = 9; } +def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 5; } +def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, + M1UnitFST, + M1UnitL]> { let Latency = 10; } +def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, + M1UnitFST]> { let Latency = 8; } +def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, + M1UnitFST, + M1UnitL]> { let Latency = 13; } +def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 3; } +def M1WriteNEONI : SchedWriteRes<[M1UnitFST, + M1UnitL]> { let Latency = 9; } +def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } +def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } +// FIXME: This is the worst case, conditional branch and link. +def M1WriteBL : SchedWriteRes<[M1UnitB, + M1UnitALU]> { let Latency = 1; } +// FIXME: This is the worst case, when using LR. +def M1WriteBLR : SchedWriteRes<[M1UnitB, + M1UnitALU, + M1UnitALU]> { let Latency = 2; } +def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } +def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } +def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } +def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } +def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } +def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } +def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } +def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; } +def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; } +def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; } +def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; } +def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } +def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; } +def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; } +def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; } +def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; } +def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } +def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } +def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } +def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } +def M1WriteTB : SchedWriteRes<[M1UnitC, + M1UnitALU]> { let Latency = 2; } + +// Branch instructions +def : InstRW<[M1WriteB ], (instrs Bcc)>; +def : InstRW<[M1WriteBL], (instrs BL)>; +def : InstRW<[M1WriteBLR], (instrs BLR)>; +def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; +def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>; + +// Arithmetic and logical integer instructions. +def : InstRW<[M1WriteALU1], (instrs COPY)>; + +// Divide and multiply instructions. + +// Miscellaneous instructions. + +// Load instructions. + +// Store instructions. + +// FP data instructions. 
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; +def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>; +def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>; +def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>; +def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; +def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>; +def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>; +def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; +def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; + +// FP miscellaneous instructions. +def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; +def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; +def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; +def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; +def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; + +// FP load instructions. + +// FP store instructions. + +// ASIMD instructions. +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; +def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; +def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>; +def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>; + +// ASIMD FP instructions. 
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>; +def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; +def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>; +def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>; +def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; + +// ASIMD miscellaneous instructions. +def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; +def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; +def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; +def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; +def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; +def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>; +def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>; +def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>; +def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 2>], + (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 3>], + (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 4>], + (instregex "^TB[LX]v8i8Four")>; +def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 2>], + (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 3>], + (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], + (instregex "^TB[LX]v16i8Four")>; +def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; +def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; + +// ASIMD load instructions. + +// ASIMD store instructions. + +// Cryptography instructions. +def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>; + +// CRC instructions. 
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>; + +} // SchedModel = ExynosM1Model diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 79c6604..844d89c 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>; def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; +def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; class SubtargetFeatureLocalMemorySize : SubtargetFeature< "localmemorysize"#Value, @@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + FeatureGCN3Encoding, FeatureCIInsts]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 4796e9e..49c94f1 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -53,7 +53,8 @@ public: ISAVersion7_0_0, ISAVersion7_0_1, ISAVersion8_0_0, - ISAVersion8_0_1 + ISAVersion8_0_1, + ISAVersion8_0_3 }; private: diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index a1584a2..4300d97 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -128,21 +128,23 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, //===----------------------------------------------------------------------===// def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] >; def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32] >; def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] >; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 609f5e7..025ed2b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, bool IsLoad = TII->get(LoadStoreOp).mayLoad(); bool RanOutOfSGPRs = false; + bool Scavenged = false; unsigned SOffset = ScratchOffset; unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); @@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, if (SOffset == AMDGPU::NoRegister) { RanOutOfSGPRs = true; SOffset = AMDGPU::SGPR0; + } else { + Scavenged = true; } BuildMI(*MBB, MI, DL, 
TII->get(AMDGPU::S_ADD_U32), SOffset) .addReg(ScratchOffset) @@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; + unsigned SOffsetRegState = 0; + if (i + 1 == e && Scavenged) + SOffsetRegState |= RegState::Kill; + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) .addReg(ScratchRsrcReg) - .addReg(SOffset) + .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3b4c235..1f5deae 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; + if (Features.test(FeatureISAVersion8_0_3)) + return {8, 0, 3}; + return {0, 0, 0}; } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index dfbb969..6e7edbf 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, // If Offset is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (Offset.getOpcode() == ISD::MUL) { + if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { @@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, // If OffReg is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (OffReg.getOpcode() == ISD::MUL) { + if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index b451ebf..16dcd46 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { // extension rather than sign extension. Make sure we pass the return // value extension property to integer materialization. unsigned SrcReg = - PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); + int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue(); // If the constant is in range, use a load-immediate. - if (UseSExt && isInt<16>(CI->getSExtValue())) { + // Since LI will sign extend the constant we need to make sure that for + // our zeroext constants that the sign extended constant fits into 16-bits - + // a range of 0..0x7fff. + if (isInt<16>(Imm)) { unsigned Opc = (VT == MVT::i64) ? 
PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getSExtValue()); - return ImmReg; - } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { - unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; - unsigned ImmReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getZExtValue()); + .addImm(Imm); return ImmReg; } // Construct the constant piecewise. - int64_t Imm = CI->getZExtValue(); - if (VT == MVT::i64) return PPCMaterialize64BitInt(Imm, RC); else if (VT == MVT::i32) diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index cb0271f..5367468 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, v16i8, v8i16>; def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, - v16i8, v4i32>; + v8i16, v4i32>; def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, v8i16, v4i32>; def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ee73267..b0a6127 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask, if (CCMask == SystemZ::CCMASK_CMP_NE) return SystemZ::CCMASK_TM_SOME_1; } - if (EffectivelyUnsigned && CmpVal <= Low) { + if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) { if (CCMask == SystemZ::CCMASK_CMP_LT) return SystemZ::CCMASK_TM_ALL_0; if (CCMask == SystemZ::CCMASK_CMP_GE) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 34f3919..c12a3ed 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SETCCE, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); @@ -14975,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); - return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), - DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
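An aside on the LowerSETCCE change above: SETCCE is a compare that also consumes the carry/borrow produced by an earlier subtraction, which is how wide integers are compared limb by limb. The patch's fix itself is bookkeeping, since X86ISD::SETCC always yields i8, an i1-typed SETCCE result now gets an explicit TRUNCATE. The standalone C++ sketch below models only the arithmetic; sbb and ult128 are illustrative names, not the DAG APIs.

    #include <cstdint>
    #include <cstdio>

    // Subtract with incoming borrow; returns the difference limb and updates
    // the outgoing borrow, like one x86 SBB step.
    static uint64_t sbb(uint64_t a, uint64_t b, bool &borrow) {
      uint64_t d = a - b - (borrow ? 1 : 0);
      borrow = (a < b) || (borrow && a == b);
      return d;
    }

    // "a < b" for 128-bit values held as two 64-bit limbs: the low limb's SUB
    // sets the borrow, the SETCCE-style compare on the high limb consumes it.
    static bool ult128(uint64_t aLo, uint64_t aHi, uint64_t bLo, uint64_t bHi) {
      bool borrow = false;
      (void)sbb(aLo, bLo, borrow); // SUB: produces the initial borrow
      (void)sbb(aHi, bHi, borrow); // SBB: consumes and propagates it
      return borrow;               // SETcc reads the final flag
    }

    int main() {
      std::printf("%d\n", ult128(0, 1, 5, 1)); // high limbs equal, 0 < 5 -> 1
      std::printf("%d\n", ult128(5, 2, 0, 1)); // high limb 2 > 1 -> 0
    }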
@@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { + if (isAllOnesConstant(Mask)) + return DAG.getTargetConstant(1, dl, MaskVT); + if (X86::isZeroNode(Mask)) + return DAG.getTargetConstant(0, dl, MaskVT); + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, @@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); @@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); //SDVTList VTs = DAG.getVTList(MVT::Other); - SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, - Scale, Chain); + Scale, Chain, *Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -17521,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); - return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, + *Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index c0786af..d9311a3 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BO1->getOperand(0)); } - if (CI->isMaxValue(true)) { + if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { ICmpInst::Predicate Pred = I.isSigned() ? 
I.getUnsignedPredicate() : I.getSignedPredicate();
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 47406b9..dd2889d 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
         ConstantInt::get(IdxType, i),
       };
       auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
-      auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName);
+      auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
+                                              LoadName);
       V = IC.Builder->CreateInsertValue(V, L, i);
     }
 
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 5cde31a..bc4c0eb 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -380,6 +380,23 @@ static void replaceExtractElements(InsertElementInst *InsElt,
     ExtendMask.push_back(UndefValue::get(IntType));
 
   Value *ExtVecOp = ExtElt->getVectorOperand();
+  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+                                   ? ExtVecOpInst->getParent()
+                                   : ExtElt->getParent();
+
+  // TODO: This restriction matches the basic block check below when creating
+  // new extractelement instructions. If that limitation is removed, this one
+  // could also be removed. But for now, we just bail out to ensure that we
+  // will replace the extractelement instruction that is feeding our
+  // insertelement instruction. This allows the insertelement to then be
+  // replaced by a shufflevector. If the insertelement is not replaced, we can
+  // induce infinite looping because there's an optimization for extractelement
+  // that will delete our widening shuffle. This would trigger another attempt
+  // here to create that shuffle, and we spin forever.
+  if (InsertionBlock != InsElt->getParent())
+    return;
+
   auto *WideVec =
       new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
                             ConstantVector::get(ExtendMask));
@@ -387,7 +404,6 @@ static void replaceExtractElements(InsertElementInst *InsElt,
   // (as long as it's not a PHI) or at the start of the basic block of the
   // extract, so any subsequent extracts in the same basic block can use it.
   // TODO: Insert before the earliest ExtractElementInst that is replaced.
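The infinite-loop hazard described in the TODO above is the classic pair of mutually-inverse rewrites: creating the widening shuffle is locally profitable, but a later extractelement fold can delete it and re-expose the original pattern. Below is a toy standalone C++ sketch of that dynamic; widen, shrink and the guarded flag are illustrative, not InstCombine APIs.

    #include <cstdio>

    enum Form { WideShuffle, NarrowExtract };

    // Rewrite 1: widen an extract into a shuffle. When guarded, it refuses to
    // fire across block boundaries, which is what breaks the cycle.
    static bool widen(Form &F, bool sameBlock, bool guarded) {
      if (F == NarrowExtract && (sameBlock || !guarded)) {
        F = WideShuffle;
        return true;
      }
      return false;
    }

    // Rewrite 2: the competing extractelement fold deletes the widening
    // shuffle whenever it cannot be reused in the extract's own block.
    static bool shrink(Form &F, bool sameBlock) {
      if (F == WideShuffle && !sameBlock) {
        F = NarrowExtract;
        return true;
      }
      return false;
    }

    int main() {
      for (int guarded = 0; guarded <= 1; ++guarded) {
        Form F = NarrowExtract;
        int steps = 0;
        // Fixpoint driver over the cross-block case, capped at 100 steps.
        // Use "|" (not "||") so both rewrites run each round.
        for (bool changed = true; changed && steps < 100; ++steps)
          changed = widen(F, /*sameBlock=*/false, guarded) |
                    shrink(F, /*sameBlock=*/false);
        std::printf("guarded=%d steps=%d\n", guarded, steps); // 100 vs 1
      }
    }

With the guard the driver reaches a fixpoint immediately in the cross-block case; without it the two rewrites ping-pong until the step cap.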
- auto *ExtVecOpInst = dyn_cast(ExtVecOp); if (ExtVecOpInst && !isa(ExtVecOpInst)) WideVec->insertAfter(ExtVecOpInst); else diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 3125a2c..e484b69 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -90,6 +90,11 @@ static cl::opt SpeculateOneExpensiveInst( cl::desc("Allow exactly one expensive instruction to be speculatively " "executed")); +static cl::opt MaxSpeculationDepth( + "max-speculation-depth", cl::Hidden, cl::init(10), + cl::desc("Limit maximum recursion depth when calculating costs of " + "speculatively executed instructions")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned &CostRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { + // It is possible to hit a zero-cost cycle (phi/gep instructions for example), + // so limit the recursion depth. + // TODO: While this recursion limit does prevent pathological behavior, it + // would be better to track visited instructions to avoid cycles. + if (Depth == MaxSpeculationDepth) + return false; + Instruction *I = dyn_cast(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs diff --git a/test/Analysis/DemandedBits/basic.ll b/test/Analysis/DemandedBits/basic.ll index 9973edf..3fd1b32 100644 --- a/test/Analysis/DemandedBits/basic.ll +++ b/test/Analysis/DemandedBits/basic.ll @@ -10,34 +10,3 @@ define i8 @test_mul(i32 %a, i32 %b) { %3 = trunc i32 %2 to i8 ret i8 %3 } - -; CHECK-LABEL: 'test_icmp1' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFFF for %2 = shl i32 %1, 4 -define i1 @test_icmp1(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = shl i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp2' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFF for %2 = ashr i32 %1, 4 -define i1 @test_icmp2(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = ashr i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp3' -; CHECK-DAG: DemandedBits: 0xFFFFFFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0x1 for %2 = icmp eq i32 -1, %1 -define i1 @test_icmp3(i32 %a) { - %1 = and i32 %a, 255 - %2 = icmp eq i32 -1, %1 - ret i1 %2 -} diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index f6e4bdf..b892f19 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -267,4 +267,278 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { ret <4 x i16> %1 } +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_une: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp une <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function 
Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ueq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ueq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ugt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ugt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ult: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ult <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ule: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ule <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uno: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uno <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_one: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) 
#0 { + %1 = fcmp one <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oeq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oeq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ogt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ogt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_olt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp olt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ole: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ole <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ord: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ord <4 x half> %a, %b + ret <4 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 137d1f3..2f70f36 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -421,4 +421,88 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { ret <8 x i16> %1 } +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp une <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped. +define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ueq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ugt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ult <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ule <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uno <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp one <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oeq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ogt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp olt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ole <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ord <8 x half> %a, %b + ret <8 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll index 0e46622..f82e98e 100644 --- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -1,6 +1,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s ; HSA: .hsa_code_object_version 1,0 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" +; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll index 3d05da6..fdc3240 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -1,5 +1,6 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;GCN-LABEL: {{^}}main: diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll new file mode 100644 index 0000000..4a12ed5 --- /dev/null +++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; When the offset of VGPR spills into scratch space gets too large, an additional SGPR +; is used to calculate the scratch load/store address. Make sure that this +; mechanism works even when many spills happen. + +; Just test that it compiles successfully. 
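For context on what this new test exercises: when a spill slot's offset overflows an instruction's immediate field, the backend folds the large part of the offset into a scavenged SGPR and keeps only a small residual. A generic standalone C++ sketch of that split follows; the 12-bit field width and the register names are illustrative, not the actual scratch-instruction encodings.

    #include <cstdint>
    #include <cstdio>

    struct Addr { const char *BaseReg; int32_t Imm; };

    // Split the offset so the residual fits the (assumed) unsigned 12-bit
    // immediate field; otherwise model "s_add scavenged, base, high_part" and
    // address relative to the scavenged register.
    static Addr lowerSpillAddress(int32_t Offset) {
      const int32_t MaxImm = (1 << 12) - 1;
      if (Offset <= MaxImm)
        return {"scratch_base", Offset};      // fits: no extra register needed
      return {"scavenged_sgpr", Offset & MaxImm};
    }

    int main() {
      Addr A = lowerSpillAddress(100);   // small offset, fits directly
      Addr B = lowerSpillAddress(20000); // overflows the immediate field
      std::printf("%s+%d\n%s+%d\n", A.BaseReg, A.Imm, B.BaseReg, B.Imm);
    }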
+; CHECK-LABEL: test +define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in, + <96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) { +entry: + %tid = call i32 @llvm.SI.tid() nounwind readnone + + %aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + %a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr + +; mark most VGPR registers as used to increase register pressure + call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" () + call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" () + call void asm sideeffect "", "~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" () + call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" () + call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" () + call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" () + call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" () + + %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr + + ret void +} + +declare i32 @llvm.SI.tid() nounwind readnone diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll index 5d44eb0..e5f9b11 100644 --- a/test/CodeGen/ARM/shifter_operand.ll +++ b/test/CodeGen/ARM/shifter_operand.ll @@ -239,3 +239,20 @@ define void @test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) { store i32 %add, i32* %addr ret void } + +define { i32, i32 } @test_multi_use_add(i32 %base, i32 %offset) { +; CHECK-LABEL: test_multi_use_add: +; CHECK-THUMB: movs [[CONST:r[0-9]+]], #28 +; CHECK-THUMB: movt [[CONST]], #1 + + %prod = mul i32 %offset, 65564 + %sum = add i32 %base, %prod + + %ptr = inttoptr i32 %sum to i32* + %loaded = load i32, i32* %ptr + + %ret.tmp = insertvalue { i32, i32 } undef, i32 %sum, 0 + %ret = insertvalue { i32, i32 } %ret.tmp, i32 %loaded, 1 + + ret { i32, i32 } %ret +} diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll index e05ef7d..0adb5a9 100644 --- a/test/CodeGen/PowerPC/fast-isel-ret.ll +++ b/test/CodeGen/PowerPC/fast-isel-ret.ll @@ -186,3 +186,12 @@ entry: ; ELF64: blr ret i32 -1 } + +define zeroext i16 @ret20() nounwind { +entry: +; ELF64-LABEL: ret20 +; ELF64: lis{{.*}}0 +; ELF64: ori{{.*}}32768 +; ELF64: blr + ret i16 32768 +} diff --git a/test/CodeGen/PowerPC/inline-asm-s-modifier.ll b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll new file mode 100644 index 0000000..c8b00b6 --- /dev/null +++ b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll @@ -0,0 +1,10 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define void @test() { +entry: + call void asm sideeffect "mtfsb1 ${0:s}", "i"(i32 7), !srcloc !1 + ret void +} +; CHECK: #APP +; CHECK-NEXT: mtfsb1 25 + +!1 = !{i32 40} diff --git a/test/CodeGen/PowerPC/pr26193.ll b/test/CodeGen/PowerPC/pr26193.ll new file mode 100644 index 0000000..acd99bc --- /dev/null +++ b/test/CodeGen/PowerPC/pr26193.ll @@ -0,0 +1,9 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define <8 x i16> @test(<4 x i32> %a) { +entry: + %0 = tail call <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x 
i32> %a, <4 x i32> %a) + ret <8 x i16> %0 +} +; CHECK: vpkswss 2, + +declare <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32>, <4 x i32>) diff --git a/test/CodeGen/PowerPC/pr26356.ll b/test/CodeGen/PowerPC/pr26356.ll new file mode 100644 index 0000000..0f5d877 --- /dev/null +++ b/test/CodeGen/PowerPC/pr26356.ll @@ -0,0 +1,136 @@ +; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s + +define zeroext i32 @f1() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i32 @f2() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i32 @f3() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3 +; CHECK: li 3, 32767 + +define zeroext i16 @f4() { +entry: + ret i16 65535 +} +; CHECK-LABEL: @f4 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f5() { +entry: + ret i16 32768 +} +; CHECK-LABEL: @f5 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i16 @f6() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f6 +; CHECK: li 3, 32767 + +define zeroext i16 @f7() { +entry: + ret i16 -1 +} +; CHECK-LABEL: @f7 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f8() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f8 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f1s() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define signext i32 @f2s() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f3s() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3s +; CHECK: li 3, 32767 + +define signext i16 @f4s() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f4s +; CHECK: li 3, 32767 + +define signext i32 @f1sn() { +entry: + ret i32 -65535 +} +; CHECK-LABEL: @f1sn +; CHECK: lis 3, -1 +; CHECK: ori 3, 3, 1 + +define signext i32 @f2sn() { +entry: + ret i32 -32768 +} +; CHECK-LABEL: @f2sn +; CHECK: li 3, -32768 + +define signext i32 @f3sn() { +entry: + ret i32 -32767 +} +; CHECK-LABEL: @f3sn +; CHECK: li 3, -32767 + +define signext i32 @f4sn() { +entry: + ret i32 -65536 +} +; CHECK-LABEL: @f4sn +; CHECK: lis 3, -1 + +define signext i16 @f5sn() { +entry: + ret i16 -32767 +} +; CHECK-LABEL: @f5sn +; CHECK: li 3, -32767 + +define signext i16 @f6sn() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f6sn +; CHECK: li 3, -32768 diff --git a/test/CodeGen/PowerPC/pr26381.ll b/test/CodeGen/PowerPC/pr26381.ll new file mode 100644 index 0000000..a45288e --- /dev/null +++ b/test/CodeGen/PowerPC/pr26381.ll @@ -0,0 +1,8 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s + +define internal signext i32 @foo() #0 { + ret i32 -125452974 +} + +; CHECK: lis 3, -1915 +; CHECK: ori 3, 3, 48466 diff --git a/test/CodeGen/SystemZ/int-cmp-53.ll b/test/CodeGen/SystemZ/int-cmp-53.ll new file mode 100644 index 0000000..b7d985e --- /dev/null +++ b/test/CodeGen/SystemZ/int-cmp-53.ll @@ -0,0 +1,26 @@ +; This used to incorrectly use a TMLL for an always-false test at -O0. 
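A note on the guard this test covers (added in the SystemZISelLowering hunk earlier): the new CmpVal > 0 condition excludes the degenerate unsigned compare against zero, where "x < 0" is already constant-false and must not be rewritten into a real test-under-mask of the low bits. A standalone C++ sketch of the simplified predicate; mayUseTestUnderMask is an illustrative stand-in for one branch of getTestUnderMaskCond.

    #include <cstdint>
    #include <cstdio>

    // Returns true when an unsigned "x < CmpVal" style compare may be turned
    // into a test-under-mask of the low bits (simplified).
    static bool mayUseTestUnderMask(uint64_t CmpVal, uint64_t Low,
                                    bool EffectivelyUnsigned) {
      return EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low;
    }

    int main() {
      // CmpVal == 0: the source compare is constant-false for unsigned x,
      // so emitting a TMLL here would wrongly test real bits.
      std::printf("%d\n", mayUseTestUnderMask(0, 1, true)); // 0
      std::printf("%d\n", mayUseTestUnderMask(1, 1, true)); // 1
    }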
+; +; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @test(i8 *%input, i32 *%result) { +entry: +; CHECK-NOT: tmll + + %0 = load i8, i8* %input, align 1 + %1 = trunc i8 %0 to i1 + %2 = zext i1 %1 to i32 + %3 = icmp sge i32 %2, 0 + br i1 %3, label %if.then, label %if.else + +if.then: + store i32 1, i32* %result, align 4 + br label %return + +if.else: + store i32 0, i32* %result, align 4 + br label %return + +return: + ret void +} + diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 3bc67cc..9ba1819 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -259,18 +259,22 @@ define void @prefetch(<8 x i64> %ind, i8* %base) { ; CHECK: ## BB#0: ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: movb $120, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} ; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) - call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1) + call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1) ret void } - declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32) define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { @@ -790,3 +794,54 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, < ret void } +define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: scatter_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: movb $96, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + +define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) { +; CHECK-LABEL: gather_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; 
CHECK-NEXT: movw $1, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: movw $220, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4) + %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4) + + %res4 = fadd <16 x float> %res, %res1 + %res5 = fadd <16 x float> %res3, %res2 + %res6 = fadd <16 x float> %res5, %res4 + ret <16 x float> %res6 +} diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 77739e7..91b42bd 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -1,26 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX +; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32 + ; Verify that we don't crash during codegen due to a wrong lowering ; of a setcc node with illegal operand types and return type. define <8 x i16> @pr25080(<8 x i32> %a) { -; CHECK-LABEL: pr25080: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; CHECK-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 -; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; AVX-LABEL: pr25080: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %0 = trunc <8 x i32> %a to <8 x i23> %1 = icmp eq <8 x i23> %0, zeroinitializer @@ -28,3 +30,46 @@ entry: %3 = sext <8 x i1> %2 to <8 x i16> ret <8 x i16> %3 } + +define void @pr26232(i64 %a) { +; KNL-32-LABEL: pr26232: +; KNL-32: # BB#0: # %for_test11.preheader +; KNL-32-NEXT: pushl %esi +; KNL-32-NEXT: .Ltmp0: +; KNL-32-NEXT: .cfi_def_cfa_offset 8 +; KNL-32-NEXT: .Ltmp1: +; KNL-32-NEXT: .cfi_offset %esi, -8 +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), 
%eax
+; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL-32-NEXT: movw $-1, %dx
+; KNL-32-NEXT: .align 16, 0x90
+; KNL-32-NEXT: .LBB1_1: # %for_loop599
+; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
+; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
+; KNL-32-NEXT: movl %eax, %esi
+; KNL-32-NEXT: sbbl $0, %esi
+; KNL-32-NEXT: movl $0, %esi
+; KNL-32-NEXT: cmovlw %dx, %si
+; KNL-32-NEXT: testw %si, %si
+; KNL-32-NEXT: jne .LBB1_1
+; KNL-32-NEXT: # BB#2: # %for_exit600
+; KNL-32-NEXT: popl %esi
+; KNL-32-NEXT: retl
+allocas:
+ br label %for_test11.preheader
+
+for_test11.preheader: ; preds = %for_test11.preheader, %allocas
+ br i1 undef, label %for_loop599, label %for_test11.preheader
+
+for_loop599: ; preds = %for_loop599, %for_test11.preheader
+ %less_i_load605_ = icmp slt i64 %a, 65536
+ %less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
+ %less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
+ %"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef
+ %intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
+ %res.i895 = icmp eq i16 %intmask.i894, 0
+ br i1 %res.i895, label %for_exit600, label %for_loop599
+
+for_exit600: ; preds = %for_loop599
+ ret void
+}
diff --git a/test/DebugInfo/X86/PR26148.ll b/test/DebugInfo/X86/PR26148.ll
new file mode 100644
index 0000000..b552508
--- /dev/null
+++ b/test/DebugInfo/X86/PR26148.ll
@@ -0,0 +1,102 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
+;
+; Created using clang -g -O3 from:
+; struct S0 {
+; short f0;
+; int f3;
+; } a;
+; void fn1(short p1) {
+; struct S0 b, c = {3};
+; b.f3 = p1;
+; a = b = c;
+; }
+;
+; int main() { return 0; }
+;
+; This is similar to the bug in test/DebugInfo/ARM/PR26163.ll, except that there is an
+; extra non-overlapping range first. Thus, we make sure that the backend actually looks
+; at all expressions when determining whether to merge ranges, not just the first one.
+; As in 26163, we expect two ranges (as opposed to one), the first one being zero-sized.
+;
+;
+; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
+; CHECK: Ending address offset: 0x0000000000000004
+; CHECK: Location description: 10 03 55 93 04
+; CHECK: Beginning address offset: 0x0000000000000004
+; CHECK: Ending address offset: 0x0000000000000014
+; CHECK: Location description: 10 03 10 00
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+%struct.S0 = type { i16, i32 }
+
+@a = common global %struct.S0 zeroinitializer, align 4
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+; The attributes are here to force the zero-sized range not to be at the start of
+; the function, which has special interpretation in DWARF. The fact that this happens
+; at all is probably an LLVM bug.
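+;
+; For orientation (assuming LLVM's (offset, size) operand order for the
+; DW_OP_bit_piece operands in DIExpression): !30 below covers bits [0, 32)
+; of the struct (f0 plus 16 bits of padding) and !31 covers bits [32, 64)
+; (f3), so each dbg.value describes only a piece of the variable, and the
+; range-merging code has to compare these piece expressions rather than
+; just the address ranges.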
+attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } +define void @fn1(i16 signext %p1) #0 !dbg !4 { +entry: + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !9, metadata !26), !dbg !27 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !10, metadata !26), !dbg !28 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !16, metadata !26), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !16, metadata !30), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29 + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !10, metadata !32), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !30), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !31), !dbg !28 + store i32 3, i32* bitcast (%struct.S0* @a to i32*), align 4, !dbg !33 + store i32 0, i32* getelementptr inbounds (%struct.S0, %struct.S0* @a, i64 0, i32 1), align 4, !dbg !33 + ret void, !dbg !34 +} + +define i32 @main() !dbg !17 { +entry: + ret i32 0, !dbg !35 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!22, !23, !24} +!llvm.ident = !{!25} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3, globals: !20) +!1 = !DIFile(filename: "small.c", directory: "/Users/kfischer/Projects/clangbug") +!2 = !{} +!3 = !{!4, !17} +!4 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !5, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7} +!7 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed) +!8 = !{!9, !10, !16} +!9 = !DILocalVariable(name: "p1", arg: 1, scope: !4, file: !1, line: 5, type: !7) +!10 = !DILocalVariable(name: "b", scope: !4, file: !1, line: 6, type: !11) +!11 = !DICompositeType(tag: DW_TAG_structure_type, name: "S0", file: !1, line: 1, size: 64, align: 32, elements: !12) +!12 = !{!13, !14} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "f0", scope: !11, file: !1, line: 2, baseType: !7, size: 16, align: 16) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "f3", scope: !11, file: !1, line: 3, baseType: !15, size: 32, align: 32, offset: 32) +!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!16 = !DILocalVariable(name: "c", scope: !4, file: !1, line: 6, type: !11) +!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 11, type: !18, isLocal: false, isDefinition: true, scopeLine: 11, isOptimized: true, variables: !2) +!18 = !DISubroutineType(types: !19) +!19 = !{!15} +!20 = !{!21} +!21 = !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 4, type: !11, isLocal: false, isDefinition: true, variable: %struct.S0* @a) +!22 = !{i32 2, !"Dwarf Version", i32 2} +!23 = !{i32 2, !"Debug Info Version", i32 3} +!24 = !{i32 1, !"PIC Level", i32 2} +!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)"} +!26 = !DIExpression() +!27 = !DILocation(line: 5, column: 16, scope: !4) +!28 = 
!DILocation(line: 6, column: 13, scope: !4)
+!29 = !DILocation(line: 6, column: 16, scope: !4)
+!30 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!31 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!32 = !DIExpression(DW_OP_bit_piece, 32, 16)
+!33 = !DILocation(line: 8, column: 9, scope: !4)
+!34 = !DILocation(line: 9, column: 1, scope: !4)
+!35 = !DILocation(line: 11, column: 14, scope: !17)
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 7d6ec96..1e64cd7 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1672,3 +1672,15 @@ define i1 @cmp_slt_rhs_inc(float %x, i32 %i) {
 %cmp = icmp slt i32 %conv, %inc
 ret i1 %cmp
 }
+
+; CHECK-LABEL: @PR26407
+; CHECK-NEXT: %[[addx:.*]] = add i32 %x, 2147483647
+; CHECK-NEXT: %[[addy:.*]] = add i32 %y, 2147483647
+; CHECK-NEXT: %[[cmp:.*]] = icmp uge i32 %[[addx]], %[[addy]]
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @PR26407(i32 %x, i32 %y) {
+ %addx = add i32 %x, 2147483647
+ %addy = add i32 %y, 2147483647
+ %cmp = icmp uge i32 %addx, %addy
+ ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 47c2a13..8ed4db8 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -175,3 +175,33 @@ bb3:
 ret <4 x double> %tmp4
 }
+; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354
+; Don't create a shufflevector if we know that we're not going to replace the insertelement.
+
+define double @pr26354(<2 x double>* %tmp, i1 %B) {
+; CHECK-LABEL: @pr26354(
+; CHECK: %ld = load <2 x double>, <2 x double>* %tmp
+; CHECK-NEXT: %e1 = extractelement <2 x double> %ld, i32 0
+; CHECK-NEXT: br i1 %B, label %if, label %end
+; CHECK: if:
+; CHECK-NEXT: %e2 = extractelement <2 x double> %ld, i32 1
+; CHECK-NEXT: %i1 = insertelement <4 x double>
+; CHECK-NEXT: br label %end
+
+entry:
+ %ld = load <2 x double>, <2 x double>* %tmp
+ %e1 = extractelement <2 x double> %ld, i32 0
+ %e2 = extractelement <2 x double> %ld, i32 1
+ br i1 %B, label %if, label %end
+
+if:
+ %i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3
+ br label %end
+
+end:
+ %ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ]
+ %e3 = extractelement <4 x double> %ph, i32 1
+ %mu = fmul double %e1, %e3
+ ret double %mu
+}
+
diff --git a/test/Transforms/InstCombine/unpack-fca.ll b/test/Transforms/InstCombine/unpack-fca.ll
index 9b8d104..4359839 100644
--- a/test/Transforms/InstCombine/unpack-fca.ll
+++ b/test/Transforms/InstCombine/unpack-fca.ll
@@ -136,3 +136,18 @@ define %B @structB(%B* %b.ptr) {
 %1 = load %B, %B* %b.ptr, align 8
 ret %B %1
 }
+
+%struct.S = type <{ i8, %struct.T }>
+%struct.T = type { i32, i32 }
+
+; Make sure that we do not increase the alignment of a packed struct element.
+define i32 @packed_alignment(%struct.S* dereferenceable(9) %s) {
+; CHECK-LABEL: packed_alignment
+; CHECK-NEXT: %tv.elt1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1, i32 1
+; CHECK-NEXT: %tv.unpack2 = load i32, i32* %tv.elt1, align 1
+; CHECK-NEXT: ret i32 %tv.unpack2
+ %t = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1
+ %tv = load %struct.T, %struct.T* %t, align 1
+ %v = extractvalue %struct.T %tv, 1
+ ret i32 %v
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index eee3104..51f899c
--- 
a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -205,39 +205,5 @@ for.body: ; preds = %for.body, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_g -; CHECK: load <16 x i8> -; CHECK: xor <16 x i8> -; CHECK: icmp ult <16 x i8> -; CHECK: select <16 x i1> {{.*}}, <16 x i8> -; CHECK: store <16 x i8> -define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 { - %1 = icmp sgt i32 %len, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0 - %2 = sext i8 %arg1 to i64 - br label %3 - -._crit_edge: ; preds = %3, %0 - ret void - -;