From d686ff024731a6edac15edc457c494395b960c32 Mon Sep 17 00:00:00 2001
From: dim
Date: Sat, 16 Jan 2016 17:17:12 +0000
Subject: Vendor import of llvm release_38 branch r257836:
https://llvm.org/svn/llvm-project/llvm/branches/release_38@257836
---
 CMakeLists.txt | 2 +-
 autoconf/configure.ac | 4 +-
 configure | 20 +++++-----
 docs/ReleaseNotes.rst | 11 ++++++
 include/llvm/CodeGen/MachineFrameInfo.h | 14 +++++++
 include/llvm/LinkAllPasses.h | 14 ++++---
 include/llvm/Target/TargetLowering.h | 2 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +-
 lib/Target/AMDGPU/AMDGPU.h | 2 +-
 lib/Target/X86/X86FrameLowering.cpp | 13 ++++---
 lib/Target/X86/X86ISelLowering.cpp | 2 +-
 test/CodeGen/X86/x86-repmov-copy-eflags.ll | 53 +++++++++++++++++++++++++++
 utils/release/test-release.sh | 6 +++
 13 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 test/CodeGen/X86/x86-repmov-copy-eflags.ll

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d2093f..4dd43e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX svn)
+  set(LLVM_VERSION_SUFFIX "")
 endif()
 
 if (POLICY CMP0048)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 02ab161e..8d0ae00 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -32,12 +32,12 @@ dnl===-----------------------------------------------------------------------===
 dnl Initialize autoconf and define the package name, version number and
 dnl address for reporting bugs.
-AC_INIT([LLVM],[3.8.0svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.8.0],[http://llvm.org/bugs/])
 
 LLVM_VERSION_MAJOR=3
 LLVM_VERSION_MINOR=8
 LLVM_VERSION_PATCH=0
-LLVM_VERSION_SUFFIX=svn
+LLVM_VERSION_SUFFIX=
 
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MAJOR], $LLVM_VERSION_MAJOR, [Major version of the LLVM API])
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MINOR], $LLVM_VERSION_MINOR, [Minor version of the LLVM API])
diff --git a/configure b/configure
index 33438c6..c94fb13 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.8.0svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.8.0.
 #
 # Report bugs to <http://llvm.org/bugs/>.
 #
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='LLVM'
 PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.8.0svn'
-PACKAGE_STRING='LLVM 3.8.0svn'
+PACKAGE_VERSION='3.8.0'
+PACKAGE_STRING='LLVM 3.8.0'
 PACKAGE_BUGREPORT='http://llvm.org/bugs/'
 
 ac_unique_file="lib/IR/Module.cpp"
@@ -1334,7 +1334,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures LLVM 3.8.0svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.8.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1400,7 +1400,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of LLVM 3.8.0svn:";;
+     short | recursive ) echo "Configuration of LLVM 3.8.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1584,7 +1584,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-LLVM configure 3.8.0svn
+LLVM configure 3.8.0
 generated by GNU Autoconf 2.60
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1600,7 +1600,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by LLVM $as_me 3.8.0svn, which was
+It was created by LLVM $as_me 3.8.0, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   $ $0 $@
 
@@ -1957,7 +1957,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 LLVM_VERSION_MAJOR=3
 LLVM_VERSION_MINOR=8
 LLVM_VERSION_PATCH=0
-LLVM_VERSION_SUFFIX=svn
+LLVM_VERSION_SUFFIX=
 
 
 cat >>confdefs.h <<_ACEOF
@@ -18279,7 +18279,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by LLVM $as_me 3.8.0svn, which was
+This file was extended by LLVM $as_me 3.8.0, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -18332,7 +18332,7 @@ Report bugs to <http://llvm.org/bugs/>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-LLVM config.status 3.8.0svn
+LLVM config.status 3.8.0
 configured by $0, generated by GNU Autoconf 2.60,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index b3f7c00..dccb7f4 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -68,6 +68,17 @@ Non-comprehensive list of changes in this release
    Core.h so nothing should change for projects directly including the headers,
    but transitive dependencies may be affected.
 
+* llvm-ar now supports thin archives.
+
+* llvm doesn't produce .data.rel.ro.local or .data.rel sections anymore.
+
+* aliases to available_externally globals are now rejected by the verifier.
+
+* the IR Linker has been split into IRMover that moves bits from one module to
+  another and Linker proper that decides what to link.
+
+* Support for dematerializing has been dropped.
+
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 48e8ca7..e50779a 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -251,6 +251,10 @@ class MachineFrameInfo {
   /// opaque mechanism like inline assembly or Win32 EH.
   bool HasOpaqueSPAdjustment;
 
+  /// True if the function contains operations which will lower down to
+  /// instructions which manipulate the stack pointer.
+  bool HasCopyImplyingStackAdjustment;
+
   /// True if the function contains a call to the llvm.vastart intrinsic.
bool HasVAStart; @@ -288,6 +292,7 @@ public: LocalFrameMaxAlign = 0; UseLocalStackAllocationBlock = false; HasOpaqueSPAdjustment = false; + HasCopyImplyingStackAdjustment = false; HasVAStart = false; HasMustTailInVarArgFunc = false; Save = nullptr; @@ -493,6 +498,15 @@ public: bool hasOpaqueSPAdjustment() const { return HasOpaqueSPAdjustment; } void setHasOpaqueSPAdjustment(bool B) { HasOpaqueSPAdjustment = B; } + /// Returns true if the function contains operations which will lower down to + /// instructions which manipulate the stack pointer. + bool hasCopyImplyingStackAdjustment() const { + return HasCopyImplyingStackAdjustment; + } + void setHasCopyImplyingStackAdjustment(bool B) { + HasCopyImplyingStackAdjustment = B; + } + /// Returns true if the function calls the llvm.va_start intrinsic. bool hasVAStart() const { return HasVAStart; } void setHasVAStart(bool B) { HasVAStart = B; } diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index d695d11..327faac 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -160,9 +160,11 @@ namespace { (void) llvm::createPostOrderFunctionAttrsPass(); (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); - (void) llvm::createPrintModulePass(*(llvm::raw_ostream*)nullptr); - (void) llvm::createPrintFunctionPass(*(llvm::raw_ostream*)nullptr); - (void) llvm::createPrintBasicBlockPass(*(llvm::raw_ostream*)nullptr); + std::string buf; + llvm::raw_string_ostream os(buf); + (void) llvm::createPrintModulePass(os); + (void) llvm::createPrintFunctionPass(os); + (void) llvm::createPrintBasicBlockPass(os); (void) llvm::createModuleDebugInfoPrinterPass(); (void) llvm::createPartialInliningPass(); (void) llvm::createLintPass(); @@ -186,10 +188,10 @@ namespace { (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); - ((llvm::Function*)nullptr)->viewCFGOnly(); + llvm::Function::Create(nullptr, llvm::GlobalValue::ExternalLinkage)->viewCFGOnly(); llvm::RGPassManager RGM; - ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM); - llvm::AliasSetTracker X(*(llvm::AliasAnalysis*)nullptr); + llvm::AliasAnalysis AA; + llvm::AliasSetTracker X(AA); X.add(nullptr, 0, llvm::AAMDNodes()); // for -print-alias-sets (void) llvm::AreStatisticsEnabled(); (void) llvm::sys::RunningOnValgrind(); diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 863b7cd..304da4f 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -2270,7 +2270,7 @@ public: } /// Return true if the MachineFunction contains a COPY which would imply - /// HasOpaqueSPAdjustment. + /// HasCopyImplyingStackAdjustment. virtual bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const { return false; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 9f8759d..c075da4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -634,7 +634,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { } if (TLI->hasCopyImplyingStackAdjustment(MF)) - MFI->setHasOpaqueSPAdjustment(true); + MFI->setHasCopyImplyingStackAdjustment(true); // Freeze the set of reserved registers now that MachineFrameInfo has been // set up. 
All the information required by getReservedRegs() should be
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5d00e1c..4f718e1 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -20,7 +20,7 @@ class AMDGPUInstrPrinter;
 class AMDGPUSubtarget;
 class AMDGPUTargetMachine;
 class FunctionPass;
-class MachineSchedContext;
+struct MachineSchedContext;
 class MCAsmInfo;
 class raw_ostream;
 class ScheduleDAGInstrs;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 8b5fd27..8632bb8 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -91,7 +91,8 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
           MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
           MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
-          MFI->hasStackMap() || MFI->hasPatchPoint());
+          MFI->hasStackMap() || MFI->hasPatchPoint() ||
+          MFI->hasCopyImplyingStackAdjustment());
 }
 
 static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -943,11 +944,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // push and pop from the stack.
   if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
       !TRI->needsStackRealignment(MF) &&
-      !MFI->hasVarSizedObjects() && // No dynamic alloca.
-      !MFI->adjustsStack() &&       // No calls.
-      !IsWin64CC &&                 // Win64 has no Red Zone
-      !MFI->hasOpaqueSPAdjustment() && // Don't push and pop.
-      !MF.shouldSplitStack()) {     // Regular stack
+      !MFI->hasVarSizedObjects() &&             // No dynamic alloca.
+      !MFI->adjustsStack() &&                   // No calls.
+      !IsWin64CC &&                             // Win64 has no Red Zone
+      !MFI->hasCopyImplyingStackAdjustment() && // Don't push and pop.
+      !MF.shouldSplitStack()) {                 // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1ec93b5..b723059 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17458,7 +17458,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     // We need a frame pointer because this will get lowered to a PUSH/POP
     // sequence.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-    MFI->setHasOpaqueSPAdjustment(true);
+    MFI->setHasCopyImplyingStackAdjustment(true);
 
     // Don't do anything here, we will expand these intrinsics out later
     // during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue(); diff --git a/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/test/CodeGen/X86/x86-repmov-copy-eflags.ll new file mode 100644 index 0000000..ad39888 --- /dev/null +++ b/test/CodeGen/X86/x86-repmov-copy-eflags.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +%struct.T = type { i64, [3 x i32] } + +; Function Attrs: nounwind optsize +define void @f(i8* %p, i8* %q, i32* inalloca nocapture %unused) #0 { +entry: + %g = alloca %struct.T, align 8 + %r = alloca i32, align 8 + store i32 0, i32* %r, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 24, i32 8, i1 false) + br label %while.body + +while.body: ; preds = %while.body, %entry + %load = load i32, i32* %r, align 4 + %dec = add nsw i32 %load, -1 + store i32 %dec, i32* %r, align 4 + call void @g(%struct.T* %g) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 + +declare void @g(%struct.T*) + +; CHECK-LABEL: _f: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: andl $-8, %esp +; CHECK-NOT: movl %esp, %esi +; CHECK: rep;movsl +; CHECK: leal 8(%esp), %esi + +; CHECK: decl (%esp) +; CHECK: seto %al +; CHECK: lahf +; CHECK: movl %eax, %edi +; CHECK: pushl %esi +; CHECK: calll _g +; CHECK: addl $4, %esp +; CHECK: movl %edi, %eax +; CHECK: addb $127, %al +; CHECK: sahf + +attributes #0 = { nounwind optsize } +attributes #1 = { argmemonly nounwind } diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index bb1f786..fb50160 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -159,6 +159,12 @@ while [ $# -gt 0 ]; do shift done +if [ "$use_autoconf" = "no" ]; then + # See llvm.org/PR26146. + echo Skipping test-suite when using CMake. + do_test_suite="no" +fi + # Check required arguments. 
if [ -z "$Release" ]; then echo "error: no release number specified" -- cgit v1.1 From 21029d6a214a88783711894533b519ce0e65cc90 Mon Sep 17 00:00:00 2001 From: dim Date: Fri, 22 Jan 2016 21:16:09 +0000 Subject: Vendor import of llvm release_38 branch r258549: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258549 --- include/llvm/CodeGen/MachineFunction.h | 2 +- include/llvm/CodeGen/SelectionDAGNodes.h | 15 + include/llvm/IR/GlobalValue.h | 4 + include/llvm/Transforms/Utils/Local.h | 19 + include/llvm/Transforms/Utils/SimplifyLibCalls.h | 2 - lib/CodeGen/AsmPrinter/DebugLocEntry.h | 13 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 18 + lib/CodeGen/CodeGenPrepare.cpp | 35 +- lib/CodeGen/MachineFunction.cpp | 2 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 44 +- lib/IR/Globals.cpp | 44 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 14 +- lib/Target/ARM/ARMISelLowering.cpp | 9 +- lib/Target/X86/X86CallingConv.td | 4 +- lib/Target/X86/X86FrameLowering.cpp | 18 +- lib/Target/X86/X86ISelLowering.cpp | 9 +- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 187 +-------- lib/Transforms/Utils/InlineFunction.cpp | 331 ++++++++++++++- lib/Transforms/Utils/Local.cpp | 235 ++++++++++- lib/Transforms/Utils/SimplifyLibCalls.cpp | 55 ++- test/CodeGen/AArch64/cxx-tlscc.ll | 27 ++ test/CodeGen/ARM/cse-flags.ll | 43 ++ test/CodeGen/ARM/cxx-tlscc.ll | 11 + test/CodeGen/ARM/memfunc.ll | 18 +- test/CodeGen/X86/2014-05-30-CombineAddNSW.ll | 20 - test/CodeGen/X86/cxx_tlscc64.ll | 27 ++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll | 83 +++- test/DebugInfo/ARM/PR26163.ll | 107 +++++ .../ExecutionEngine/MCJIT/remote/cross-module-a.ll | 2 +- .../ExecutionEngine/MCJIT/remote/multi-module-a.ll | 2 +- .../MCJIT/remote/simpletest-remote.ll | 2 +- test/ExecutionEngine/MCJIT/remote/stubs-remote.ll | 2 +- .../MCJIT/remote/test-common-symbols-remote.ll | 2 +- .../MCJIT/remote/test-data-align-remote.ll | 2 +- .../remote/test-fp-no-external-funcs-remote.ll | 2 +- .../remote/test-global-init-nonzero-remote.ll | 2 +- .../remote/test-global-init-nonzero-sm-pic.ll | 2 +- .../MCJIT/remote/test-ptr-reloc-remote.ll | 2 +- .../MCJIT/remote/test-ptr-reloc-sm-pic.ll | 2 +- .../OrcMCJIT/remote/cross-module-a.ll | 2 +- .../OrcMCJIT/remote/multi-module-a.ll | 2 +- .../OrcMCJIT/remote/simpletest-remote.ll | 2 +- .../OrcMCJIT/remote/stubs-remote.ll | 2 +- .../OrcMCJIT/remote/test-common-symbols-remote.ll | 2 +- .../OrcMCJIT/remote/test-data-align-remote.ll | 2 +- .../remote/test-fp-no-external-funcs-remote.ll | 2 +- .../remote/test-global-init-nonzero-remote.ll | 2 +- .../remote/test-global-init-nonzero-sm-pic.ll | 2 +- .../OrcMCJIT/remote/test-ptr-reloc-remote.ll | 2 +- .../OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll | 2 +- test/MC/AArch64/inst-directive.s | 15 +- .../CodeGenPrepare/ARM/bitreverse-recognize.ll | 37 ++ test/Transforms/CodeGenPrepare/ARM/lit.local.cfg | 3 + test/Transforms/CodeGenPrepare/bitreverse-hang.ll | 53 +++ test/Transforms/Inline/inline-funclets.ll | 455 +++++++++++++++++++++ test/Transforms/InstCombine/bitreverse-hang.ll | 53 +++ .../Transforms/InstCombine/bitreverse-recognize.ll | 114 ------ test/Transforms/InstCombine/cos-2.ll | 16 +- .../InstCombine/double-float-shrink-1.ll | 20 + tools/lli/lli.cpp | 5 +- utils/release/test-release.sh | 21 +- 62 files changed, 1772 insertions(+), 469 deletions(-) create mode 100644 test/CodeGen/ARM/cse-flags.ll delete mode 100644 test/CodeGen/X86/2014-05-30-CombineAddNSW.ll create mode 100644 
test/DebugInfo/ARM/PR26163.ll create mode 100644 test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll create mode 100644 test/Transforms/CodeGenPrepare/ARM/lit.local.cfg create mode 100644 test/Transforms/CodeGenPrepare/bitreverse-hang.ll create mode 100644 test/Transforms/Inline/inline-funclets.ll create mode 100644 test/Transforms/InstCombine/bitreverse-hang.ll delete mode 100644 test/Transforms/InstCombine/bitreverse-recognize.ll diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 82c30d3..df7c951 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -295,7 +295,7 @@ public: } /// Should we be emitting segmented stack stuff for the function - bool shouldSplitStack(); + bool shouldSplitStack() const; /// getNumBlockIDs - Return the number of MBB ID's allocated. /// diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 23816bd..536fc65 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -369,6 +369,18 @@ public: (UnsafeAlgebra << 3) | (NoNaNs << 4) | (NoInfs << 5) | (NoSignedZeros << 6) | (AllowReciprocal << 7); } + + /// Clear any flags in this flag set that aren't also set in Flags. + void intersectWith(const SDNodeFlags *Flags) { + NoUnsignedWrap &= Flags->NoUnsignedWrap; + NoSignedWrap &= Flags->NoSignedWrap; + Exact &= Flags->Exact; + UnsafeAlgebra &= Flags->UnsafeAlgebra; + NoNaNs &= Flags->NoNaNs; + NoInfs &= Flags->NoInfs; + NoSignedZeros &= Flags->NoSignedZeros; + AllowReciprocal &= Flags->AllowReciprocal; + } }; /// Represents one node in the SelectionDAG. @@ -682,6 +694,9 @@ public: /// and directly, but it is not to avoid creating a vtable for this class. const SDNodeFlags *getFlags() const; + /// Clear any flags in this node that aren't also set in Flags. + void intersectFlagsWith(const SDNodeFlags *Flags); + /// Return the number of values defined/returned by this operator. unsigned getNumValues() const { return NumValues; } diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 4fa4e7d..fa6469a 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -346,6 +346,10 @@ public: return !(isDeclarationForLinker() || isWeakForLinker()); } + // Returns true if the alignment of the value can be unilaterally + // increased. + bool canIncreaseAlignment() const; + /// This method unlinks 'this' from the containing module, but does not delete /// it. virtual void removeFromParent() = 0; diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 911c6f1..3ae0165 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -331,6 +331,25 @@ unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, /// during lowering by the GC infrastructure. bool callsGCLeafFunction(ImmutableCallSite CS); +//===----------------------------------------------------------------------===// +// Intrinsic pattern matching +// + +/// Try and match a bitreverse or bswap idiom. +/// +/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added +/// instructions are returned in \c InsertedInsts. They will all have been added +/// to a basic block. +/// +/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where +/// BW is the bitwidth of the integer type). 
A bswap idiom requires anywhere up
+/// to BW / 4 nodes to be searched, so is significantly faster.
+///
+/// This function returns true on a successful match or false otherwise.
+bool recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 410a075..fc34f49 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -125,8 +125,6 @@ private:
   Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
 
   // Math Library Optimizations
-  Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType);
-  Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B);
   Value *optimizeCos(CallInst *CI, IRBuilder<> &B);
   Value *optimizePow(CallInst *CI, IRBuilder<> &B);
   Value *optimizeExp2(CallInst *CI, IRBuilder<> &B);
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index bbe5324..b60ab91 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -93,18 +93,7 @@ public:
   /// variable, merge them by appending Next's values to the current
   /// list of values.
   /// Return true if the merge was successful.
-  bool MergeValues(const DebugLocEntry &Next) {
-    if (Begin == Next.Begin) {
-      auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
-      auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
-      if (Expr->isBitPiece() && NextExpr->isBitPiece()) {
-        addValues(Next.Values);
-        End = Next.End;
-        return true;
-      }
-    }
-    return false;
-  }
+  bool MergeValues(const DebugLocEntry &Next);
 
   /// \brief Attempt to merge this DebugLocEntry with Next and return
   /// true if the merge was successful. Entries can be merged if they
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index a4fb07e..ae62b6b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -805,6 +805,24 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
   return (l1 < r2) && (l2 < r1);
 }
 
+/// \brief If this and Next are describing different pieces of the same
+/// variable, merge them by appending Next's values to the current
+/// list of values.
+/// Return true if the merge was successful.
+bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
+  if (Begin == Next.Begin) {
+    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
+    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
+    if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
+        !piecesOverlap(Expr, NextExpr)) {
+      addValues(Next.Values);
+      End = Next.End;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// Build the location list for all DBG_VALUEs in the function that
 /// describe the same variable. If the ranges of several independent
 /// pieces of the same variable overlap partially, split them up and
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 03e5778..c8007a5 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1742,8 +1742,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
         // over-aligning global variables that have an explicit section is
         // forbidden.
         GlobalVariable *GV;
-        if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
-            !GV->hasSection() && GV->getAlignment() < PrefAlign &&
+        if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
+            GV->getAlignment() < PrefAlign &&
             DL->getTypeAllocSize(GV->getType()->getElementType()) >=
                 MinSize + Offset2)
           GV->setAlignment(PrefAlign);
@@ -5211,6 +5211,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   return false;
 }
 
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+                           const TargetLowering &TLI) {
+  if (!I.getType()->isIntegerTy() ||
+      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+                                    TLI.getValueType(DL, I.getType(), true)))
+    return false;
+
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
+    return false;
+  Instruction *LastInst = Insts.back();
+  I.replaceAllUsesWith(LastInst);
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  return true;
+}
+
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
@@ -5224,8 +5242,19 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
       if (ModifiedDT) return true;
     }
 
-  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+  bool MadeBitReverse = true;
+  while (TLI && MadeBitReverse) {
+    MadeBitReverse = false;
+    for (auto &I : reverse(BB)) {
+      if (makeBitReverse(I, *DL, *TLI)) {
+        MadeBitReverse = MadeChange = true;
+        break;
+      }
+    }
+  }
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
+
   return MadeChange;
 }
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index ca4bb1c..f6604f3 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -163,7 +163,7 @@ getOrCreateJumpTableInfo(unsigned EntryKind) {
 }
 
 /// Should we be emitting segmented stack stuff for the function
-bool MachineFunction::shouldSplitStack() {
+bool MachineFunction::shouldSplitStack() const {
   return getFunction()->hasFnAttribute("split-stack");
 }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 96bf914..893871f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -377,22 +377,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
   }
 }
 
-/// Add logical or fast math flag values to FoldingSetNodeID value.
-static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode,
-                           const SDNodeFlags *Flags) {
-  if (!isBinOpWithFlags(Opcode))
-    return;
-
-  unsigned RawFlags = 0;
-  if (Flags)
-    RawFlags = Flags->getRawFlags();
-  ID.AddInteger(RawFlags);
-}
-
-static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) {
-  AddNodeIDFlags(ID, N->getOpcode(), N->getFlags());
-}
-
 static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
                           SDVTList VTList, ArrayRef<SDValue> OpList) {
   AddNodeIDOpcode(ID, OpC);
@@ -528,8 +512,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
   }
   } // end switch (N->getOpcode())
 
-  AddNodeIDFlags(ID, N);
-
   // Target specific memory nodes could also have address spaces to check.
   if (N->isTargetMemoryOpcode())
     ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
@@ -851,6 +833,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -869,6 +854,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -886,6 +874,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
   AddNodeIDCustom(ID, N);
   SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
+  if (Node)
+    if (const SDNodeFlags *Flags = N->getFlags())
+      Node->intersectFlagsWith(Flags);
   return Node;
 }
 
@@ -3892,10 +3883,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
     SDValue Ops[] = {N1, N2};
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops);
-    AddNodeIDFlags(ID, Opcode, Flags);
     void *IP = nullptr;
-    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
+    if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) {
+      if (Flags)
+        E->intersectFlagsWith(Flags);
       return SDValue(E, 0);
+    }
 
     N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
 
@@ -6249,10 +6242,12 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
   if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops);
-    AddNodeIDFlags(ID, Opcode, Flags);
     void *IP = nullptr;
-    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP))
+    if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) {
+      if (Flags)
+        E->intersectFlagsWith(Flags);
       return E;
+    }
   }
   return nullptr;
 }
@@ -6948,6 +6943,11 @@ const SDNodeFlags *SDNode::getFlags() const {
   return nullptr;
 }
 
+void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) {
+  if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
+    FlagsNode->Flags.intersectWith(Flags);
+}
+
 SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
   assert(N->getNumValues() == 1 &&
          "Can't unroll a vector with multiple results!");
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6159f93..a61b62b 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -12,11 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/GlobalValue.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
@@ -134,6 +135,47 @@ bool GlobalValue::isDeclaration() const {
   return false;
 }
 
+bool GlobalValue::canIncreaseAlignment() const {
+  // Firstly, can only increase the alignment of a global if it
+  // is a strong definition.
+  if (!isStrongDefinitionForLinker())
+    return false;
+
+  // It also has to either not have a section defined, or, not have
+  // alignment specified.
(If it is assigned a section, the global + // could be densely packed with other objects in the section, and + // increasing the alignment could cause padding issues.) + if (hasSection() && getAlignment() > 0) + return false; + + // On ELF platforms, we're further restricted in that we can't + // increase the alignment of any variable which might be emitted + // into a shared library, and which is exported. If the main + // executable accesses a variable found in a shared-lib, the main + // exe actually allocates memory for and exports the symbol ITSELF, + // overriding the symbol found in the library. That is, at link + // time, the observed alignment of the variable is copied into the + // executable binary. (A COPY relocation is also generated, to copy + // the initial data from the shadowed variable in the shared-lib + // into the location in the main binary, before running code.) + // + // And thus, even though you might think you are defining the + // global, and allocating the memory for the global in your object + // file, and thus should be able to set the alignment arbitrarily, + // that's not actually true. Doing so can cause an ABI breakage; an + // executable might have already been built with the previous + // alignment of the variable, and then assuming an increased + // alignment will be incorrect. + + // Conservatively assume ELF if there's no parent pointer. + bool isELF = + (!Parent || Triple(Parent->getTargetTriple()).isOSBinFormatELF()); + if (isELF && hasDefaultVisibility() && !hasLocalLinkage()) + return false; + + return true; +} + //===----------------------------------------------------------------------===// // GlobalVariable Implementation //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ecfbe9..9b73c5e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10133,6 +10133,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) @@ -10152,13 +10153,13 @@ void AArch64TargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index d26604f..685907a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -112,9 +112,21 @@ public: MCELFStreamer::EmitInstruction(Inst, STI); } + /// Emit a 32-bit value as an instruction. This is only used for the .inst + /// directive, EmitInstruction should be used in other cases. 
void emitInst(uint32_t Inst) { + char Buffer[4]; + + // We can't just use EmitIntValue here, as that will emit a data mapping + // symbol, and swap the endianness on big-endian systems (instructions are + // always little-endian). + for (unsigned I = 0; I < 4; ++I) { + Buffer[I] = uint8_t(Inst); + Inst >>= 8; + } + EmitA64MappingSymbol(); - MCELFStreamer::EmitIntValue(Inst, 4); + MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); } /// This is one of the functions used to emit data into an ELF section, so the diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 37c0795..978e99c 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12423,6 +12423,7 @@ void ARMTargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (ARM::GPRRegClass.contains(*I)) @@ -12442,13 +12443,13 @@ void ARMTargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index e8b96e7..ed2e880 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -832,10 +832,10 @@ def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; // CSRs that are handled by prologue, epilogue. -def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>; // CSRs that are handled explicitly via copies. -def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>; // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8632bb8..7f8ce47 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -2031,6 +2031,10 @@ void X86FrameLowering::adjustForSegmentedStacks( unsigned TlsReg, TlsOffset; DebugLoc DL; + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. + assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); @@ -2271,6 +2275,11 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); DebugLoc DL; + + // To support shrink-wrapping we would need to insert the new blocks + // at the right place and update the branches to PrologueMBB. 
+ assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); + // HiPE-specific values const unsigned HipeLeafWords = 24; const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; @@ -2584,7 +2593,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. - return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); + return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && + // The lowering of segmented stack and HiPE only support entry blocks + // as prologue blocks: PR26107. + // This limitation may be lifted if we fix: + // - adjustForSegmentedStacks + // - adjustForHiPEPrologue + MF.getFunction()->getCallingConv() != CallingConv::HiPE && + !MF.shouldSplitStack(); } MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b723059..6904714 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -28908,6 +28908,7 @@ void X86TargetLowering::insertCopiesSplitCSR( const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) @@ -28925,13 +28926,13 @@ void X86TargetLowering::insertCopiesSplitCSR( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); - BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - NewVR) + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); + // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) - BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), - *I) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 95c50d3..76cefd9 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -1565,190 +1566,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return Changed ? &I : nullptr; } - -/// Analyze the specified subexpression and see if it is capable of providing -/// pieces of a bswap or bitreverse. The subexpression provides a potential -/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in -/// the output of the expression came from a corresponding bit in some other -/// value. This function is recursive, and the end result is a mapping of -/// (value, bitnumber) to bitnumber. It is the caller's responsibility to -/// validate that all `value`s are identical and that the bitnumber to bitnumber -/// mapping is correct for a bswap or bitreverse. 
-///
-/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
-/// that the expression deposits the low byte of %X into the high byte of the
-/// result and that all other bits are zero. This expression is accepted,
-/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].
-///
-/// This function returns true if the match was unsuccessful and false if it succeeded.
-/// On entry to the function the "OverallLeftShift" is a signed integer value
-/// indicating the number of bits that the subexpression is later shifted. For
-/// example, if the expression is later right shifted by 16 bits, the
-/// OverallLeftShift value would be -16 on entry. This is used to specify which
-/// bits of BitValues are actually being set.
-///
-/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding
-/// bit is masked to zero by a user. For example, in (X & 255), X will be
-/// processed with a bytemask of 255. BitMask is always in the local
-/// (OverallLeftShift) coordinate space.
-///
-static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask,
-                            SmallVectorImpl<Value *> &BitValues,
-                            SmallVectorImpl<int> &BitProvenance) {
-  if (Instruction *I = dyn_cast<Instruction>(V)) {
-    // If this is an or instruction, it may be an inner node of the bswap.
-    if (I->getOpcode() == Instruction::Or)
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance) ||
-             CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-
-    // If this is a logical shift by a constant, recurse with OverallLeftShift
-    // and BitMask adjusted.
-    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
-      unsigned ShAmt =
-          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
-      // Ensure the shift amount is defined.
-      if (ShAmt > BitValues.size())
-        return true;
-
-      unsigned BitShift = ShAmt;
-      if (I->getOpcode() == Instruction::Shl) {
-        // X << C -> collect(X, +C)
-        OverallLeftShift += BitShift;
-        BitMask = BitMask.lshr(BitShift);
-      } else {
-        // X >>u C -> collect(X, -C)
-        OverallLeftShift -= BitShift;
-        BitMask = BitMask.shl(BitShift);
-      }
-
-      if (OverallLeftShift >= (int)BitValues.size())
-        return true;
-      if (OverallLeftShift <= -(int)BitValues.size())
-        return true;
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-
-    // If this is a logical 'and' with a mask that clears bits, clear the
-    // corresponding bits in BitMask.
-    if (I->getOpcode() == Instruction::And &&
-        isa<ConstantInt>(I->getOperand(1))) {
-      unsigned NumBits = BitValues.size();
-      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
-      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
-
-      for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) {
-        // If this bit is masked out by a later operation, we don't care what
-        // the and mask is.
-        if (BitMask[i] == 0)
-          continue;
-
-        // If the AndMask is zero for this bit, clear the bit.
-        APInt MaskB = AndMask & Bit;
-        if (MaskB == 0) {
-          BitMask.clearBit(i);
-          continue;
-        }
-
-        // Otherwise, this bit is kept.
-      }
-
-      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
-                             BitValues, BitProvenance);
-    }
-  }
-
-  // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
-  // the input value to the bswap/bitreverse. To be part of a bswap or
-  // bitreverse we must be demanding a contiguous range of bits from it.
-  unsigned InputBitLen = BitMask.countPopulation();
-  unsigned InputBitNo = BitMask.countTrailingZeros();
-  if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo !=
-      InputBitLen)
-    // Not a contiguous set range of bits!
-    return true;
-
-  // We know we're moving a contiguous range of bits from the input to the
-  // output. Record which bits in the output came from which bits in the input.
-  unsigned DestBitNo = InputBitNo + OverallLeftShift;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitProvenance[DestBitNo + I] = InputBitNo + I;
-
-  // If the destination bit value is already defined, the values are or'd
-  // together, which isn't a bswap/bitreverse (unless it's an or of the same
-  // bits).
-  if (BitValues[DestBitNo] && BitValues[DestBitNo] != V)
-    return true;
-  for (unsigned I = 0; I < InputBitLen; ++I)
-    BitValues[DestBitNo + I] = V;
-
-  return false;
-}
-
-static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
-                                          unsigned BitWidth) {
-  if (From % 8 != To % 8)
-    return false;
-  // Convert from bit indices to byte indices and check for a byte reversal.
-  From >>= 3;
-  To >>= 3;
-  BitWidth >>= 3;
-  return From == BitWidth - To - 1;
-}
-
-static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
-                                               unsigned BitWidth) {
-  return From == BitWidth - To - 1;
-}
-
 /// Given an OR instruction, check to see if this is a bswap or bitreverse
 /// idiom. If so, insert the new intrinsic and return it.
 Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
-  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
-  if (!ITy)
-    return nullptr;   // Can't do vectors.
-  unsigned BW = ITy->getBitWidth();
-
-  /// We keep track of which bit (BitProvenance) inside which value (BitValues)
-  /// defines each bit in the result.
-  SmallVector<Value *, 8> BitValues(BW, nullptr);
-  SmallVector<int, 8> BitProvenance(BW, -1);
-
-  // Try to find all the pieces corresponding to the bswap.
-  APInt BitMask = APInt::getAllOnesValue(BitValues.size());
-  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))
-    return nullptr;
-
-  // Check to see if all of the bits come from the same value.
-  Value *V = BitValues[0];
-  if (!V) return nullptr;  // Didn't find a bit?  Must be zero.
-
-  if (!std::all_of(BitValues.begin(), BitValues.end(),
-                   [&](const Value *X) { return X == V; }))
-    return nullptr;
-
-  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
-  // only byteswap values with an even number of bytes.
-  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
-  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) {
-    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
-    OKForBitReverse &=
-        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
-  }
-
-  Intrinsic::ID Intrin;
-  if (OKForBSwap)
-    Intrin = Intrinsic::bswap;
-  else if (OKForBitReverse)
-    Intrin = Intrinsic::bitreverse;
-  else
+  SmallVector<Instruction *, 4> Insts;
+  if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))
     return nullptr;
+  Instruction *LastInst = Insts.pop_back_val();
+  LastInst->removeFromParent();
 
-  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);
-  return CallInst::Create(F, V);
+  for (auto *Inst : Insts)
+    Worklist.Add(Inst);
+  return LastInst;
 }
 
 /// We have an expression of the form (A&C)|(B&D).
Check if A is (cond?-1:0) diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 1457411..79282a2 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -179,13 +179,244 @@ void LandingPadInliningInfo::forwardResume( RI->eraseFromParent(); } +/// Helper for getUnwindDestToken/getUnwindDestTokenHelper. +static Value *getParentPad(Value *EHPad) { + if (auto *FPI = dyn_cast(EHPad)) + return FPI->getParentPad(); + return cast(EHPad)->getParentPad(); +} + +typedef DenseMap UnwindDestMemoTy; + +/// Helper for getUnwindDestToken that does the descendant-ward part of +/// the search. +static Value *getUnwindDestTokenHelper(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + SmallVector Worklist(1, EHPad); + + while (!Worklist.empty()) { + Instruction *CurrentPad = Worklist.pop_back_val(); + // We only put pads on the worklist that aren't in the MemoMap. When + // we find an unwind dest for a pad we may update its ancestors, but + // the queue only ever contains uncles/great-uncles/etc. of CurrentPad, + // so they should never get updated while queued on the worklist. + assert(!MemoMap.count(CurrentPad)); + Value *UnwindDestToken = nullptr; + if (auto *CatchSwitch = dyn_cast(CurrentPad)) { + if (CatchSwitch->hasUnwindDest()) { + UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); + } else { + // Catchswitch doesn't have a 'nounwind' variant, and one might be + // annotated as "unwinds to caller" when really it's nounwind (see + // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the + // parent's unwind dest from this. We can check its catchpads' + // descendants, since they might include a cleanuppad with an + // "unwinds to caller" cleanupret, which can be trusted. + for (auto HI = CatchSwitch->handler_begin(), + HE = CatchSwitch->handler_end(); + HI != HE && !UnwindDestToken; ++HI) { + BasicBlock *HandlerBlock = *HI; + auto *CatchPad = cast(HandlerBlock->getFirstNonPHI()); + for (User *Child : CatchPad->users()) { + // Intentionally ignore invokes here -- since the catchswitch is + // marked "unwind to caller", it would be a verifier error if it + // contained an invoke which unwinds out of it, so any invoke we'd + // encounter must unwind to some child of the catch. + if (!isa(Child) && !isa(Child)) + continue; + + Instruction *ChildPad = cast(Child); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't figure out this child pad yet; queue it. + Worklist.push_back(ChildPad); + continue; + } + // We've already checked this child, but might have found that + // it offers no proof either way. + Value *ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + // We already know the child's unwind dest, which can either + // be ConstantTokenNone to indicate unwind to caller, or can + // be another child of the catchpad. Only the former indicates + // the unwind dest of the catchswitch. 
+ if (isa(ChildUnwindDestToken)) { + UnwindDestToken = ChildUnwindDestToken; + break; + } + assert(getParentPad(ChildUnwindDestToken) == CatchPad); + } + } + } + } else { + auto *CleanupPad = cast(CurrentPad); + for (User *U : CleanupPad->users()) { + if (auto *CleanupRet = dyn_cast(U)) { + if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) + UnwindDestToken = RetUnwindDest->getFirstNonPHI(); + else + UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); + break; + } + Value *ChildUnwindDestToken; + if (auto *Invoke = dyn_cast(U)) { + ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); + } else if (isa(U) || isa(U)) { + Instruction *ChildPad = cast(U); + auto Memo = MemoMap.find(ChildPad); + if (Memo == MemoMap.end()) { + // Haven't resolved this child yet; queue it and keep searching. + Worklist.push_back(ChildPad); + continue; + } + // We've checked this child, but still need to ignore it if it + // had no proof either way. + ChildUnwindDestToken = Memo->second; + if (!ChildUnwindDestToken) + continue; + } else { + // Not a relevant user of the cleanuppad + continue; + } + // In a well-formed program, the child/invoke must either unwind to + // an(other) child of the cleanup, or exit the cleanup. In the + // first case, continue searching. + if (isa(ChildUnwindDestToken) && + getParentPad(ChildUnwindDestToken) == CleanupPad) + continue; + UnwindDestToken = ChildUnwindDestToken; + break; + } + } + // If we haven't found an unwind dest for CurrentPad, we may have queued its + // children, so move on to the next in the worklist. + if (!UnwindDestToken) + continue; + + // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits + // any ancestors of CurrentPad up to but not including UnwindDestToken's + // parent pad. Record this in the memo map, and check to see if the + // original EHPad being queried is one of the ones exited. + Value *UnwindParent; + if (auto *UnwindPad = dyn_cast(UnwindDestToken)) + UnwindParent = getParentPad(UnwindPad); + else + UnwindParent = nullptr; + bool ExitedOriginalPad = false; + for (Instruction *ExitedPad = CurrentPad; + ExitedPad && ExitedPad != UnwindParent; + ExitedPad = dyn_cast(getParentPad(ExitedPad))) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(ExitedPad)) + continue; + MemoMap[ExitedPad] = UnwindDestToken; + ExitedOriginalPad |= (ExitedPad == EHPad); + } + + if (ExitedOriginalPad) + return UnwindDestToken; + + // Continue the search. + } + + // No definitive information is contained within this funclet. + return nullptr; +} + +/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad, +/// return that pad instruction. If it unwinds to caller, return +/// ConstantTokenNone. If it does not have a definitive unwind destination, +/// return nullptr. +/// +/// This routine gets invoked for calls in funclets in inlinees when inlining +/// an invoke. Since many funclets don't have calls inside them, it's queried +/// on-demand rather than building a map of pads to unwind dests up front. +/// Determining a funclet's unwind dest may require recursively searching its +/// descendants, and also ancestors and cousins if the descendants don't provide +/// an answer. Since most funclets will have their unwind dest immediately +/// available as the unwind dest of a catchswitch or cleanupret, this routine +/// searches top-down from the given pad and then up. 
To avoid worst-case +/// quadratic run-time given that approach, it uses a memo map to avoid +/// re-processing funclet trees. The callers that rewrite the IR as they go +/// take advantage of this, for correctness, by checking/forcing rewritten +/// pads' entries to match the original callee view. +static Value *getUnwindDestToken(Instruction *EHPad, + UnwindDestMemoTy &MemoMap) { + // Catchpads unwind to the same place as their catchswitch; + // redirct any queries on catchpads so the code below can + // deal with just catchswitches and cleanuppads. + if (auto *CPI = dyn_cast(EHPad)) + EHPad = CPI->getCatchSwitch(); + + // Check if we've already determined the unwind dest for this pad. + auto Memo = MemoMap.find(EHPad); + if (Memo != MemoMap.end()) + return Memo->second; + + // Search EHPad and, if necessary, its descendants. + Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap); + assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0)); + if (UnwindDestToken) + return UnwindDestToken; + + // No information is available for this EHPad from itself or any of its + // descendants. An unwind all the way out to a pad in the caller would + // need also to agree with the unwind dest of the parent funclet, so + // search up the chain to try to find a funclet with information. Put + // null entries in the memo map to avoid re-processing as we go up. + MemoMap[EHPad] = nullptr; + Instruction *LastUselessPad = EHPad; + Value *AncestorToken; + for (AncestorToken = getParentPad(EHPad); + auto *AncestorPad = dyn_cast(AncestorToken); + AncestorToken = getParentPad(AncestorToken)) { + // Skip over catchpads since they just follow their catchswitches. + if (isa(AncestorPad)) + continue; + assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]); + auto AncestorMemo = MemoMap.find(AncestorPad); + if (AncestorMemo == MemoMap.end()) { + UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap); + } else { + UnwindDestToken = AncestorMemo->second; + } + if (UnwindDestToken) + break; + LastUselessPad = AncestorPad; + } + + // Since the whole tree under LastUselessPad has no information, it all must + // match UnwindDestToken; record that to avoid repeating the search. + SmallVector Worklist(1, LastUselessPad); + while (!Worklist.empty()) { + Instruction *UselessPad = Worklist.pop_back_val(); + assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr); + MemoMap[UselessPad] = UnwindDestToken; + if (auto *CatchSwitch = dyn_cast(UselessPad)) { + for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) + for (User *U : HandlerBlock->getFirstNonPHI()->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } else { + assert(isa(UselessPad)); + for (User *U : UselessPad->users()) + if (isa(U) || isa(U)) + Worklist.push_back(cast(U)); + } + } + + return UnwindDestToken; +} + /// When we inline a basic block into an invoke, /// we have to turn all of the calls that can throw into invokes. /// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. 
-static BasicBlock *
-HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
+static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
+    BasicBlock *BB, BasicBlock *UnwindEdge,
+    UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
   for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
     Instruction *I = &*BBI++;
 
@@ -196,6 +427,31 @@ HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
     if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
       continue;
 
+    if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+      // This call is nested inside a funclet. If that funclet has an unwind
+      // destination within the inlinee, then unwinding out of this call would
+      // be UB. Rewriting this call to an invoke which targets the inlined
+      // invoke's unwind dest would give the call's parent funclet multiple
+      // unwind destinations, which is something that subsequent EH table
+      // generation can't handle and that the verifier rejects. So when we
+      // see such a call, leave it as a call.
+      auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
+      Value *UnwindDestToken =
+          getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
+      if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+        continue;
+#ifndef NDEBUG
+      Instruction *MemoKey;
+      if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+        MemoKey = CatchPad->getCatchSwitch();
+      else
+        MemoKey = FuncletPad;
+      assert(FuncletUnwindMap->count(MemoKey) &&
+             (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
+             "must get memoized to avoid confusing later searches");
+#endif // NDEBUG
+    }
+
     // Convert this function call into an invoke instruction. First, split the
     // basic block.
     BasicBlock *Split =
@@ -328,13 +584,23 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
 
   // This connects all the instructions which 'unwind to caller' to the invoke
   // destination.
+  UnwindDestMemoTy FuncletUnwindMap;
   for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
        BB != E; ++BB) {
     if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
       if (CRI->unwindsToCaller()) {
-        CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI);
+        auto *CleanupPad = CRI->getCleanupPad();
+        CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
         CRI->eraseFromParent();
         UpdatePHINodes(&*BB);
+        // Finding a cleanupret with an unwind destination would confuse
+        // subsequent calls to getUnwindDestToken, so map the cleanuppad
+        // to short-circuit any such calls and recognize this as an "unwind
+        // to caller" cleanup.
+        assert(!FuncletUnwindMap.count(CleanupPad) ||
+               isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
+        FuncletUnwindMap[CleanupPad] =
+            ConstantTokenNone::get(Caller->getContext());
       }
     }
 
@@ -345,12 +611,41 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
     Instruction *Replacement = nullptr;
     if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
       if (CatchSwitch->unwindsToCaller()) {
+        Value *UnwindDestToken;
+        if (auto *ParentPad =
+                dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
+          // This catchswitch is nested inside another funclet. If that
+          // funclet has an unwind destination within the inlinee, then
+          // unwinding out of this catchswitch would be UB. Rewriting this
+          // catchswitch to unwind to the inlined invoke's unwind dest would
+          // give the parent funclet multiple unwind destinations, which is
+          // something that subsequent EH table generation can't handle and
+          // that the verifier rejects. So when we see such a call, leave it
+          // as "unwind to caller".
+          UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
+          if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+            continue;
+        } else {
+          // This catchswitch has no parent to inherit constraints from, and
+          // none of its descendants can have an unwind edge that exits it and
+          // targets another funclet in the inlinee. It may or may not have a
+          // descendant that definitively has an unwind to caller. In either
+          // case, we'll have to assume that any unwinds out of it may need to
+          // be routed to the caller, so treat it as though it has a definitive
+          // unwind to caller.
+          UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
+        }
         auto *NewCatchSwitch = CatchSwitchInst::Create(
             CatchSwitch->getParentPad(), UnwindDest,
             CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
             CatchSwitch);
         for (BasicBlock *PadBB : CatchSwitch->handlers())
           NewCatchSwitch->addHandler(PadBB);
+        // Propagate info for the old catchswitch over to the new one in
+        // the unwind map. This also serves to short-circuit any subsequent
+        // checks for the unwind dest of this catchswitch, which would get
+        // confused if they found the outer handler in the callee.
+        FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
         Replacement = NewCatchSwitch;
       }
     } else if (!isa<FuncletPadInst>(I)) {
@@ -369,8 +664,8 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
 
   for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
        BB != E; ++BB)
-    if (BasicBlock *NewBB =
-        HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest))
+    if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+            &*BB, UnwindDest, &FuncletUnwindMap))
       // Update any PHI nodes in the exceptional block to indicate that there
       // is now a new entry in them.
       UpdatePHINodes(NewBB);
@@ -1415,6 +1710,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     }
   }
 
+  // If we are inlining for an invoke instruction, we must make sure to rewrite
+  // any call instructions into invoke instructions. This is sensitive to which
+  // funclet pads were top-level in the inlinee, so must be done before
+  // rewriting the "parent pad" links.
+  if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
+    BasicBlock *UnwindDest = II->getUnwindDest();
+    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
+    if (isa<LandingPadInst>(FirstNonPHI)) {
+      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+    } else {
+      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+    }
+  }
+
   // Update the lexical scopes of the new funclets and callsites.
   // Anything that had 'none' as its parent is now nested inside the callsite's
   // EHPad.
@@ -1472,18 +1781,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     }
   }
 
-  // If we are inlining for an invoke instruction, we must make sure to rewrite
-  // any call instructions into invoke instructions.
-  if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
-    BasicBlock *UnwindDest = II->getUnwindDest();
-    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
-    if (isa<LandingPadInst>(FirstNonPHI)) {
-      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
-    } else {
-      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
-    }
-  }
-
   // Handle any inlined musttail call sites. In order for a new call site to be
   // musttail, the source of the clone and the inlined call site must have been
   // musttail. Therefore it's safe to return without merging control into the
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index d2793e5..abc9b65 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -944,37 +944,44 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
 static unsigned enforceKnownAlignment(Value *V, unsigned Align,
                                       unsigned PrefAlign,
                                       const DataLayout &DL) {
+  assert(PrefAlign > Align);
+
   V = V->stripPointerCasts();
 
   if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    // TODO: ideally, computeKnownBits ought to have used
+    // AllocaInst::getAlignment() in its computation already, making
+    // the below max redundant. But, as it turns out,
+    // stripPointerCasts recurses through infinite layers of bitcasts,
+    // while computeKnownBits is not allowed to traverse more than 6
+    // levels.
+    Align = std::max(AI->getAlignment(), Align);
+    if (PrefAlign <= Align)
+      return Align;
+
     // If the preferred alignment is greater than the natural stack alignment
     // then don't round up. This avoids dynamic stack realignment.
     if (DL.exceedsNaturalStackAlignment(PrefAlign))
      return Align;
-    // If there is a requested alignment and if this is an alloca, round up.
-    if (AI->getAlignment() >= PrefAlign)
-      return AI->getAlignment();
     AI->setAlignment(PrefAlign);
     return PrefAlign;
   }
 
   if (auto *GO = dyn_cast<GlobalObject>(V)) {
+    // TODO: as above, this shouldn't be necessary.
+    Align = std::max(GO->getAlignment(), Align);
+    if (PrefAlign <= Align)
+      return Align;
+
     // If there is a large requested alignment and we can, bump up the alignment
     // of the global. If the memory we set aside for the global may not be the
     // memory used by the final program then it is impossible for us to reliably
     // enforce the preferred alignment.
-    if (!GO->isStrongDefinitionForLinker())
+    if (!GO->canIncreaseAlignment())
       return Align;
 
-    if (GO->getAlignment() >= PrefAlign)
-      return GO->getAlignment();
-    // We can only increase the alignment of the global if it has no alignment
-    // specified or if it is not assigned a section. If it is assigned a
-    // section, the global could be densely packed with other objects in the
-    // section, increasing the alignment could cause padding issues.
-    if (!GO->hasSection() || GO->getAlignment() == 0)
-      GO->setAlignment(PrefAlign);
-    return GO->getAlignment();
+    GO->setAlignment(PrefAlign);
+    return PrefAlign;
   }
 
   return Align;
@@ -1585,3 +1592,205 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
 
   return false;
 }
+
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+  BitPart(Value *P, unsigned BW) : Provider(P) {
+    Provenance.resize(BW);
+  }
+
+  /// The Value that this is a bitreverse/bswap of.
+  Value *Provider;
+  /// The "provenance" of each bit. Provenance[A] = B means that bit A
+  /// in Provider becomes bit B in the result of this expression.
+  SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+  enum { Unset = -1 };
+};
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
+/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a nullptr
+/// BitParts (Value has been analyzed and the analysis failed), we use an
+/// Optional type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+///
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+                std::map<Value *, Optional<BitPart>> &BPS) {
+  auto I = BPS.find(V);
+  if (I != BPS.end())
+    return I->second;
+
+  auto &Result = BPS[V] = None;
+  auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    // If this is an or instruction, it may be an inner node of the bswap.
+    if (I->getOpcode() == Instruction::Or) {
+      auto &A = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      auto &B = collectBitParts(I->getOperand(1), MatchBSwaps,
+                                MatchBitReversals, BPS);
+      if (!A || !B)
+        return Result;
+
+      // Try and merge the two together.
+      if (!A->Provider || A->Provider != B->Provider)
+        return Result;
+
+      Result = BitPart(A->Provider, BitWidth);
+      for (unsigned i = 0; i < A->Provenance.size(); ++i) {
+        if (A->Provenance[i] != BitPart::Unset &&
+            B->Provenance[i] != BitPart::Unset &&
+            A->Provenance[i] != B->Provenance[i])
+          return Result = None;
+
+        if (A->Provenance[i] == BitPart::Unset)
+          Result->Provenance[i] = B->Provenance[i];
+        else
+          Result->Provenance[i] = A->Provenance[i];
+      }
+
+      return Result;
+    }
+
+    // If this is a logical shift by a constant, recurse then shift the result.
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+      unsigned BitShift =
+          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+      // Ensure the shift amount is defined.
+      if (BitShift > BitWidth)
+        return Result;
+
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      // Perform the "shift" on BitProvenance.
+      auto &P = Result->Provenance;
+      if (I->getOpcode() == Instruction::Shl) {
+        P.erase(std::prev(P.end(), BitShift), P.end());
+        P.insert(P.begin(), BitShift, BitPart::Unset);
+      } else {
+        P.erase(P.begin(), std::next(P.begin(), BitShift));
+        P.insert(P.end(), BitShift, BitPart::Unset);
+      }
+
+      return Result;
+    }
+
+    // If this is a logical 'and' with a mask that clears bits, recurse then
+    // unset the appropriate bits.
+    if (I->getOpcode() == Instruction::And &&
+        isa<ConstantInt>(I->getOperand(1))) {
+      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
+      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+      // Check that the mask allows a multiple of 8 bits for a bswap, for an
+      // early exit.
+      unsigned NumMaskedBits = AndMask.countPopulation();
+      if (!MatchBitReversals && NumMaskedBits % 8 != 0)
+        return Result;
+
+      auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
+                                  MatchBitReversals, BPS);
+      if (!Res)
+        return Result;
+      Result = Res;
+
+      for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1)
+        // If the AndMask is zero for this bit, clear the bit.
+        if ((AndMask & Bit) == 0)
+          Result->Provenance[i] = BitPart::Unset;
+
+      return Result;
+    }
+  }
+
+  // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+  // the input value to the bswap/bitreverse.
+  Result = BitPart(V, BitWidth);
+  for (unsigned i = 0; i < BitWidth; ++i)
+    Result->Provenance[i] = i;
+  return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+                                          unsigned BitWidth) {
+  if (From % 8 != To % 8)
+    return false;
+  // Convert from bit indices to byte indices and check for a byte reversal.
+  From >>= 3;
+  To >>= 3;
+  BitWidth >>= 3;
+  return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+                                               unsigned BitWidth) {
+  return From == BitWidth - To - 1;
+}
+
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+bool llvm::recognizeBitReverseOrBSwapIdiom(
+    Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+    SmallVectorImpl<Instruction *> &InsertedInsts) {
+  if (Operator::getOpcode(I) != Instruction::Or)
+    return false;
+  if (!MatchBSwaps && !MatchBitReversals)
+    return false;
+  IntegerType *ITy = dyn_cast<IntegerType>(I->getType());
+  if (!ITy || ITy->getBitWidth() > 128)
+    return false;  // Can't do vectors or integers > 128 bits.
+  unsigned BW = ITy->getBitWidth();
+
+  // Try to find all the pieces corresponding to the bswap.
+  std::map<Value *, Optional<BitPart>> BPS;
+  auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
+  if (!Res)
+    return false;
+  auto &BitProvenance = Res->Provenance;
+
+  // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+  // only byteswap values with an even number of bytes.
+  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
+  for (unsigned i = 0; i < BW; ++i) {
+    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
+    OKForBitReverse &=
+        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
+  }
+
+  Intrinsic::ID Intrin;
+  if (OKForBSwap && MatchBSwaps)
+    Intrin = Intrinsic::bswap;
+  else if (OKForBitReverse && MatchBitReversals)
+    Intrin = Intrinsic::bitreverse;
+  else
+    return false;
+
+  Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
+  InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
+  return true;
+}
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index dc07440..2f3c311 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -970,15 +970,34 @@ static Value *valueHasFloatPrecision(Value *Val) {
   return nullptr;
 }
 
-//===----------------------------------------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+/// Any floating-point library function that we're trying to simplify will have
+/// a signature of the form: fptype foo(fptype param1, fptype param2, ...).
+/// CheckDoubleTy indicates that 'fptype' must be 'double'.
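To make the shape concrete before the helper itself, here are a few illustrative prototypes (made-up examples, not taken from the patch). With NumParams == 1 and CheckDoubleTy == true, only the first would be accepted:

    extern "C" double floor(double);       // accepted: double(double)
    extern "C" float floorf(float);        // rejected: return type is not double
    extern "C" double ldexp(double, int);  // rejected: two parameters, and the
                                           // int parameter differs from the
                                           // return type
    // With CheckDoubleTy == false, any FP type works as long as every
    // parameter type equals the return type, e.g. for NumParams == 2:
    extern "C" float fminf(float, float);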
+static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams, + bool CheckDoubleTy) { + FunctionType *FT = F->getFunctionType(); + if (FT->getNumParams() != NumParams) + return false; + + // The return type must match what we're looking for. + Type *RetTy = FT->getReturnType(); + if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy()) + return false; + + // Each parameter must match the return type, and therefore, match every other + // parameter too. + for (const Type *ParamTy : FT->params()) + if (ParamTy != RetTy) + return false; -Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, - bool CheckRetType) { + return true; +} + +/// Shrink double -> float for unary functions like 'floor'. +static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, + bool CheckRetType) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) + if (!matchesFPLibFunctionSignature(Callee, 1, true)) return nullptr; if (CheckRetType) { @@ -1013,15 +1032,10 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, return B.CreateFPExt(V, B.getDoubleTy()); } -// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax' -Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { +/// Shrink double -> float for binary functions like 'fmin/fmax'. +static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) + if (!matchesFPLibFunctionSignature(Callee, 2, true)) return nullptr; // If this is something like 'fmin((double)floatval1, (double)floatval2)', @@ -1394,12 +1408,21 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - + Value *Ret = nullptr; if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" || Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); + // FIXME: Refactor - this check is repeated all over this file and even in the + // preceding call to shrink double -> float. + + // Make sure this has 1 argument of FP type, which matches the result type. 
+ FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + if (!CI->hasUnsafeAlgebra()) return Ret; diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll index a9ae00c..9996c0d 100644 --- a/test/CodeGen/AArch64/cxx-tlscc.ll +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -8,6 +8,7 @@ @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -74,3 +75,29 @@ __tls_init.exit: ; CHECK-NOT: ldp d27, d26 ; CHECK-NOT: ldp d29, d28 ; CHECK-NOT: ldp d31, d30 + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 +; CHECK: blr +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/cse-flags.ll b/test/CodeGen/ARM/cse-flags.ll new file mode 100644 index 0000000..c18e2fc --- /dev/null +++ b/test/CodeGen/ARM/cse-flags.ll @@ -0,0 +1,43 @@ +; RUN: llc -asm-verbose=false < %s | FileCheck %s +; PR26063 + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK: .LBB0_1: +; CHECK-NEXT: bl f{{$}} +; CHECK-NEXT: ldrb r[[T0:[0-9]+]], [r{{[0-9]+}}, #1]!{{$}} +; CHECK-NEXT: cmp r{{[0-9]+}}, #1{{$}} +; CHECK-NEXT: cmpne r[[T0]], #0{{$}} +; CHECK-NEXT: bne .LBB0_1{{$}} +define i8* @h(i8* readonly %a, i32 %b, i32 %c) { +entry: + %0 = load i8, i8* %a, align 1 + %tobool4 = icmp ne i8 %0, 0 + %cmp5 = icmp ne i32 %b, 1 + %1 = and i1 %cmp5, %tobool4 + br i1 %1, label %while.body.preheader, label %while.end + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ] + %call = tail call i32 bitcast (i32 (...)* @f to i32 ()*)() + %incdec.ptr = getelementptr inbounds i8, i8* %a.addr.06, i32 1 + %2 = load i8, i8* %incdec.ptr, align 1 + %tobool = icmp ne i8 %2, 0 + %cmp = icmp ne i32 %call, 1 + %3 = and i1 %cmp, %tobool + br i1 %3, label %while.body, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body + %incdec.ptr.lcssa = phi i8* [ %incdec.ptr, %while.body ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %a.addr.0.lcssa = phi i8* [ %a, %entry ], [ %incdec.ptr.lcssa, %while.end.loopexit ] + ret i8* %a.addr.0.lcssa +} + +declare i32 @f(...) 
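A plausible C-level source for the new ARM test above, offered only as an illustrative reconstruction (the authoritative reduction lives in PR26063). The point is that both the entry check and the loop back-edge re-test the call result and the loaded byte, so the flag-setting compares must not be reused across the call to f(), which clobbers CPSR:

    // Illustrative reconstruction; names mirror the IR above.
    extern "C" int f(void);

    extern "C" const char *h(const char *a, int b, int c) {
      (void)c;                 // third parameter is unused in the IR, too
      while (b != 1 && *a) {   // cmp/cmpne/bne in the CHECK lines
        b = f();               // call clobbers the condition flags
        ++a;                   // ldrb with post-increment writeback
      }
      return a;
    }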
diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll index 7b776d4..11173bb 100644 --- a/test/CodeGen/ARM/cxx-tlscc.ll +++ b/test/CodeGen/ARM/cxx-tlscc.ll @@ -8,6 +8,7 @@ @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) @@ -44,3 +45,13 @@ __tls_init.exit: ; CHECK-NOT: pop {r9, r12} ; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc} ; CHECK: pop {lr} + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: push {r1, r2, r3, r4, r7, lr} +; CHECK-NOT: push {r9, r12} +; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} +; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7} +; CHECK: blx +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll index 66743f3..46fef76 100644 --- a/test/CodeGen/ARM/memfunc.ll +++ b/test/CodeGen/ARM/memfunc.ll @@ -1,10 +1,10 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN -; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI -; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI -; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI +; RUN: llc < %s -mtriple=armv7-apple-ios -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-IOS --check-prefix=CHECK +; RUN: llc < %s -mtriple=thumbv7m-none-macho -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-DARWIN --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-eabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK +; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK define void @f1(i8* %dest, i8* %src) { entry: @@ -402,8 +402,8 @@ entry: ; CHECK: arr1: ; CHECK-IOS: .align 3 ; CHECK-DARWIN: .align 2 -; CHECK-EABI: .align 2 -; CHECK-GNUEABI: .align 2 +; CHECK-EABI-NOT: .align +; CHECK-GNUEABI-NOT: .align ; CHECK: arr2: ; CHECK: {{\.section.+foo,bar}} ; CHECK-NOT: .align diff --git a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll b/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll deleted file mode 100644 index 4580795..0000000 --- a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s -; CHECK: addl - -; The two additions are the same , but have different flags. 
-; In theory this code should never be generated by the frontend, but this -; tries to test that two identical instructions with two different flags -; actually generate two different nodes. -; -; Normally the combiner would see this condition without the flags -; and optimize the result of the sub into a register clear -; (the final result would be 0). With the different flags though the combiner -; needs to keep the add + sub nodes, because the two nodes result as different -; nodes and so cannot assume that the subtraction of the two nodes -; generates 0 as result -define i32 @foo(i32 %a, i32 %b) { - %1 = add i32 %a, %b - %2 = add nsw i32 %a, %b - %3 = sub i32 %1, %2 - ret i32 %3 -} diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll index 70fe501..6c8e45e 100644 --- a/test/CodeGen/X86/cxx_tlscc64.ll +++ b/test/CodeGen/X86/cxx_tlscc64.ll @@ -4,11 +4,13 @@ ; tricks similar to AArch64 fast TLS calling convention (r255821). ; Applying tricks on x86-64 similar to r255821. ; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -O0 | FileCheck %s --check-prefix=CHECK-O0 %struct.S = type { i8 } @sg = internal thread_local global %struct.S zeroinitializer, align 1 @__dso_handle = external global i8 @__tls_guard = internal thread_local unnamed_addr global i1 false +@sum1 = internal thread_local global i32 0, align 4 declare void @_ZN1SC1Ev(%struct.S*) declare void @_ZN1SD1Ev(%struct.S*) @@ -50,3 +52,28 @@ init.i: __tls_init.exit: ret %struct.S* @sg } + +; CHECK-LABEL: _ZTW4sum1 +; CHECK-NOT: pushq %r11 +; CHECK-NOT: pushq %r10 +; CHECK-NOT: pushq %r9 +; CHECK-NOT: pushq %r8 +; CHECK-NOT: pushq %rsi +; CHECK-NOT: pushq %rdx +; CHECK-NOT: pushq %rcx +; CHECK-NOT: pushq %rbx +; CHECK: callq +define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind { + ret i32* @sum1 +} + +; Make sure at O0 we don't overwrite RBP. +; CHECK-O0-LABEL: _ZTW4sum2 +; CHECK-O0: pushq %rbp +; CHECK-O0: movq %rsp, %rbp +; CHECK-O0-NOT: movq %r{{.*}}, (%rbp) +define cxx_fast_tlscc i32* @_ZTW4sum2() #0 { + ret i32* @sum1 +} + +attributes #0 = { nounwind "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll index 7c00f40..eb87f71 100644 --- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll +++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll @@ -1,11 +1,5 @@ ; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK ; -; This test checks that we do not use shrink-wrapping when -; the function does not have any frame pointer and may unwind. -; This is a workaround for a limitation in the emission of -; the CFI directives, that are not correct in such case. -; PR25614 -; ; Note: This test cannot be merged with the shrink-wrapping tests ; because the booleans set on the command line take precedence on ; the target logic that disable shrink-wrapping. @@ -13,6 +7,12 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "x86_64-apple-macosx" +; This test checks that we do not use shrink-wrapping when +; the function does not have any frame pointer and may unwind. +; This is a workaround for a limitation in the emission of +; the CFI directives, that are not correct in such case. +; PR25614 +; ; No shrink-wrapping should occur here, until the CFI information are fixed. 
 ; CHECK-LABEL: framelessUnwind:
 ;
@@ -151,3 +151,74 @@ false:
 }
 
 attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
+
+
+; Check that we generate correct code for segmented stack.
+; We used to emit the code at the entry point of the function
+; instead of just before the prologue.
+; For now, shrink-wrapping is disabled on segmented stack functions: PR26107.
+;
+; CHECK-LABEL: segmentedStack:
+; CHECK: cmpq
+; CHECK-NEXT: ja [[ENTRY_LABEL:LBB[0-9_]+]]
+;
+; CHECK: callq ___morestack
+; CHECK-NEXT: retq
+;
+; CHECK: [[ENTRY_LABEL]]:
+; Prologue
+; CHECK: push
+;
+; In PR26107, we used to drop these two basic blocks, because
+; the segmentedStack entry block was jumping directly to
+; the place where the prologue is actually needed, which is
+; the call to memcmp.
+; Then, those two basic blocks did not have any predecessors
+; anymore and were removed.
+;
+; Check if vk1 is null
+; CHECK: testq %rdi, %rdi
+; CHECK-NEXT: je [[STRINGS_EQUAL:LBB[0-9_]+]]
+;
+; Check if vk2 is null
+; CHECK: testq %rsi, %rsi
+; CHECK-NEXT: je [[STRINGS_EQUAL]]
+;
+; CHECK: [[STRINGS_EQUAL]]
+; CHECK-NEXT: popq
+define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
+entry:
+  %cmp.i = icmp eq i8* %vk1, null
+  %cmp1.i = icmp eq i8* %vk2, null
+  %brmerge.i = or i1 %cmp.i, %cmp1.i
+  %cmp1.mux.i = and i1 %cmp.i, %cmp1.i
+  br i1 %brmerge.i, label %__go_ptr_strings_equal.exit, label %if.end4.i
+
+if.end4.i:                                        ; preds = %entry
+  %tmp = getelementptr inbounds i8, i8* %vk1, i64 8
+  %tmp1 = bitcast i8* %tmp to i64*
+  %tmp2 = load i64, i64* %tmp1, align 8
+  %tmp3 = getelementptr inbounds i8, i8* %vk2, i64 8
+  %tmp4 = bitcast i8* %tmp3 to i64*
+  %tmp5 = load i64, i64* %tmp4, align 8
+  %cmp.i.i = icmp eq i64 %tmp2, %tmp5
+  br i1 %cmp.i.i, label %land.rhs.i.i, label %__go_ptr_strings_equal.exit
+
+land.rhs.i.i:                                     ; preds = %if.end4.i
+  %tmp6 = bitcast i8* %vk2 to i8**
+  %tmp7 = load i8*, i8** %tmp6, align 8
+  %tmp8 = bitcast i8* %vk1 to i8**
+  %tmp9 = load i8*, i8** %tmp8, align 8
+  %call.i.i = tail call i32 @memcmp(i8* %tmp9, i8* %tmp7, i64 %tmp2) #5
+  %cmp4.i.i = icmp eq i32 %call.i.i, 0
+  br label %__go_ptr_strings_equal.exit
+
+__go_ptr_strings_equal.exit:                      ; preds = %land.rhs.i.i, %if.end4.i, %entry
+  %retval.0.i = phi i1 [ %cmp1.mux.i, %entry ], [ false, %if.end4.i ], [ %cmp4.i.i, %land.rhs.i.i ]
+  ret i1 %retval.0.i
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) #5
+
+attributes #5 = { nounwind readonly ssp uwtable "split-stack" }
diff --git a/test/DebugInfo/ARM/PR26163.ll b/test/DebugInfo/ARM/PR26163.ll
new file mode 100644
index 0000000..9ab0e35
--- /dev/null
+++ b/test/DebugInfo/ARM/PR26163.ll
@@ -0,0 +1,107 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
+;
+; Checks that we're creating two ranges, one that terminates immediately
+; and one that spans the rest of the function. This isn't necessarily the
+; best thing to do here (and also not necessarily correct, since the first
+; one has a bit_piece), but it is what is currently being emitted, any
+; change here needs to be intentional, so the test is very specific.
+;
+; CHECK: .debug_loc contents:
+; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
+; CHECK:                Ending address offset: 0x0000000000000004
+; CHECK:                 Location description: 10 00 9f
+; CHECK:             Beginning address offset: 0x0000000000000004
+; CHECK:                Ending address offset: 0x0000000000000014
+; CHECK:                 Location description: 10 00 9f

+; Created from the following test case (PR26163) with
+; clang -cc1 -triple armv4t--freebsd11.0-gnueabi -emit-obj -debug-info-kind=standalone -O2 -x c test.c
+;
+; typedef unsigned int size_t;
+; struct timeval {
+;   long long tv_sec;
+;   int tv_usec;
+; };
+;
+; void *memset(void *, int, size_t);
+; void foo(void);
+;
+; static void
+; bar(int value)
+; {
+;   struct timeval lifetime;
+;
+;   memset(&lifetime, 0, sizeof(struct timeval));
+;   lifetime.tv_sec = value;
+;
+;   foo();
+; }
+;
+; int
+; parse_config_file(void)
+; {
+;   int value;
+;
+;   bar(value);
+;   return (0);
+; }

+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t--freebsd11.0-gnueabi"

+%struct.timeval = type { i64, i32 }

+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)

+declare void @foo()

+define i32 @parse_config_file() !dbg !4 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !26), !dbg !27
+  tail call void @llvm.dbg.declare(metadata %struct.timeval* undef, metadata !16, metadata !26), !dbg !29
+  tail call void @llvm.dbg.value(metadata i64 0, i64 0, metadata !16, metadata !30), !dbg !29
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29
+  tail call void @foo() #3, !dbg !32
+  ret i32 0, !dbg !33
+}


+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23, !24}
+!llvm.ident = !{!25}

+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!1 = !DIFile(filename: "<stdin>", directory: "/home/ubuntu/bugs")
+!2 = !{}
+!3 = !{!4, !11}
+!4 = distinct !DISubprogram(name: "parse_config_file", scope: !5, file: !5, line: 22, type: !6, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: true, variables: !9)
+!5 = !DIFile(filename: "test.c", directory: "/home/ubuntu/bugs")
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8}
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DILocalVariable(name: "value", scope: !4, file: !5, line: 24, type: !8)
+!11 = distinct !DISubprogram(name: "bar", scope: !5, file: !5, line: 11, type: !12, isLocal: true, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, variables: !14)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !8}
+!14 = !{!15, !16}
+!15 = !DILocalVariable(name: "value", arg: 1, scope: !11, file: !5, line: 11, type: !8)
+!16 = !DILocalVariable(name: "lifetime", scope: !11, file: !5, line: 13, type: !17)
+!17 = !DICompositeType(tag: DW_TAG_structure_type, name: "timeval", file: !5, line: 2, size: 128, align: 64, elements: !18)
+!18 = !{!19, !21}
+!19 = !DIDerivedType(tag: DW_TAG_member, name: "tv_sec", scope: !17, file: !5, line: 3, baseType: !20, size: 64, align: 64)
+!20 = !DIBasicType(name: "long long int", size: 64, align: 64, encoding: DW_ATE_signed)
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "tv_usec", scope: !17, file: !5, line: 4, baseType: !8, size: 32, align: 32, offset: 64)
+!22 = !{i32 2, !"Debug Info Version", i32 3}
+!23 = !{i32 1, !"wchar_size", i32 4}
+!24 = !{i32 1, !"min_enum_size", i32 4}
+!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 89dda3855cda574f355e6defa1d77bdae5053994) (llvm/trunk 257891)"}
+!26 = !DIExpression()
+!27 = !DILocation(line: 11, scope: !11, inlinedAt: !28)
+!28 = distinct !DILocation(line: 26, scope: !4)
+!29 = !DILocation(line: 13, scope: !11, inlinedAt: !28)
+!30 = !DIExpression(DW_OP_bit_piece, 0, 64)
+!31 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!32 = !DILocation(line: 18, scope: !11, inlinedAt: !28)
+!33 = !DILocation(line: 27, scope: !4)
diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
index 7df88b1..b91a043 100644
--- a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 declare i32 @FB()
 
diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
index d35418b..94938a8 100644
--- a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 declare i32 @FB()
 
diff --git a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
index 0d1a1ec..72449f3 100644
--- a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 define i32 @bar() nounwind {
   ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
index 31ed752..31271b5 100644
--- a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
-; XFAIL: win32
+; XFAIL: mingw32,win32
 ; This test should fail until remote symbol resolution is supported.
 
 define i32 @main() nounwind {
diff --git a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
index bbeab10..9d1abbc 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
@@ -1,5 +1,5 @@
 ; RUN: %lli -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
-; XFAIL: win32
+; XFAIL: mingw32,win32
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.
diff --git a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll index 0aa19b2..afa8a95 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; Check that a variable is always aligned as specified. diff --git a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll index 13bac29..f996159 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define double @test(double* %DP, double %Arg) nounwind { %D = load double, double* %DP ; [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll index 5d5480e..329dc5c 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll index ef74fa0..44557ea 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll index c2260fc..a249c2f 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll index 2a45472..2817053 100644 --- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git 
a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll index 249aad2..6fbb2bc 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/cross-module-a.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll index 32c58ee..ce09417 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/multi-module-a.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 declare i32 @FB() diff --git a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll index aaf3ebc..bc477c2 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/simpletest-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define i32 @bar() nounwind { ret i32 0 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll index a0d9410..001a617 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/stubs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; This test should fail until remote symbol resolution is supported. define i32 @main() nounwind { diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll index 9b4e246..4c4256e 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-common-symbols-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; The intention of this test is to verify that symbols mapped to COMMON in ELF ; work as expected. diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll index 88a561b..1621501 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-data-align-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 ; Check that a variable is always aligned as specified. 
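A rough sketch of why these XFAIL lines need "mingw32" in addition to "win32", as I understand lit's XFAIL handling (the authoritative logic lives in utils/lit and also consults feature names, not only the host triple): each comma-separated entry marks the test expected-to-fail when it occurs as a substring of the host description, and MinGW hosts spell theirs with "mingw32", which does not contain "win32".

    // Illustrative only; not lit's actual implementation.
    #include <sstream>
    #include <string>

    static bool expectedToFail(const std::string &HostTriple,
                               const std::string &XFailEntries) {
      std::istringstream SS(XFailEntries);
      std::string Entry;
      while (std::getline(SS, Entry, ','))   // split on commas
        if (!Entry.empty() && HostTriple.find(Entry) != std::string::npos)
          return true;                       // substring match on the triple
      return false;
    }
    // expectedToFail("i686-pc-mingw32", "win32")          -> false
    // expectedToFail("i686-pc-mingw32", "mingw32,win32")  -> true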
diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll index 484541a..6ff8704 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-fp-no-external-funcs-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 define double @test(double* %DP, double %Arg) nounwind { %D = load double, double* %DP ; [#uses=1] diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll index adc3e94..a7c8bfe 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null -; XFAIL: win32 +; XFAIL: mingw32,win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll index 8ab3fd5..a028df6 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-global-init-nonzero-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -relocation-model=pic -code-model=small %s > /dev/null -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll index a47c801..d369d2b 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-remote.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s -; XFAIL: win32 +; XFAIL: mingw32,win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll index 210ac6f..e918dab 100644 --- a/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll +++ b/test/ExecutionEngine/OrcMCJIT/remote/test-ptr-reloc-sm-pic.ll @@ -1,6 +1,6 @@ ; RUN: %lli -jit-kind=orc-mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext \ ; RUN: -O0 -relocation-model=pic -code-model=small %s -; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, win32 +; XFAIL: mips-, mipsel-, aarch64, arm, i686, i386, mingw32, win32 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/MC/AArch64/inst-directive.s b/test/MC/AArch64/inst-directive.s index 3bb620f..7fd5200 100644 --- a/test/MC/AArch64/inst-directive.s +++ b/test/MC/AArch64/inst-directive.s @@ -1,7 +1,14 @@ // RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=asm -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ASM -// RUN: llvm-mc %s 
-triple=aarch64-none-linux-gnu -filetype=obj -o - \ -// RUN: | llvm-readobj -s -sd | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s --check-prefix=CHECK-SYMS + +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=asm -o - \ +// RUN: | FileCheck %s --check-prefix=CHECK-ASM +// RUN: llvm-mc %s -triple=aarch64_be-none-linux-gnu -filetype=obj -o %t +// RUN: llvm-readobj -s -sd %t | FileCheck %s --check-prefix=CHECK-OBJ +// RUN: llvm-objdump -t %t | FileCheck %s --check-prefix=CHECK-SYMS .section .inst.aarch64_inst @@ -22,3 +29,7 @@ aarch64_inst: // CHECK-OBJ: SectionData ( // CHECK-OBJ-NEXT: 0000: 2040105E // CHECK-OBJ-NEXT: ) + +// CHECK-SYMS-NOT: 0000000000000000 .inst.aarch64_inst 00000000 $d +// CHECK-SYMS: 0000000000000000 .inst.aarch64_inst 00000000 $x +// CHECK-SYMS-NOT: 0000000000000000 .inst.aarch64_inst 00000000 $d diff --git a/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll new file mode 100644 index 0000000..36440da --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/bitreverse-recognize.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -loop-unroll -codegenprepare < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7--linux-gnueabihf" + +; CHECK-LABEL: @f +define i32 @f(i32 %a) #0 { +; CHECK: call i32 @llvm.bitreverse.i32 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %or + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %b.07 = phi i32 [ 0, %entry ], [ %or, %for.body ] + %shr = lshr i32 %a, %i.08 + %and = and i32 %shr, 1 + %sub = sub nuw nsw i32 31, %i.08 + %shl = shl i32 %and, %sub + %or = or i32 %shl, %b.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, 32 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !3 +} + +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"min_enum_size", i32 4} +!2 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git b7441a0f42c43a8eea9e3e706be187252db747fa)"} +!3 = distinct !{!3, !4} +!4 = !{!"llvm.loop.unroll.full"} diff --git a/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg new file mode 100644 index 0000000..98c6700 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/ARM/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/CodeGenPrepare/bitreverse-hang.ll b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll new file mode 100644 index 0000000..c81dcc1 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/bitreverse-hang.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-unroll -codegenprepare -S | FileCheck %s + +; This test is a worst-case scenario for bitreversal/byteswap detection. 
+; After loop unrolling (the unrolled loop is unreadably large so it has been kept
+; rolled here), we have a binary tree of OR operands (as bitreversal detection
+; looks straight through shifts):
+;
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;  | \
+;  |  LSHR
+;  | /
+;  OR
+;
+; This results in exponential runtime. The loop here is 32 iterations which will
+; totally hang if we don't deal with this case cleverly.

+@b = common global i32 0, align 4

+; CHECK: define i32 @fn1
+define i32 @fn1() #0 {
+entry:
+  %b.promoted = load i32, i32* @b, align 4, !tbaa !2
+  br label %for.body

+for.body:                                         ; preds = %for.body, %entry
+  %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %shr = lshr i32 %or4, 1
+  %or = or i32 %shr, %or4
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 32
+  br i1 %exitcond, label %for.end, label %for.body

+for.end:                                          ; preds = %for.body
+  store i32 %or, i32* @b, align 4, !tbaa !2
+  ret i32 undef
+}

+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }

+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}

+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/Inline/inline-funclets.ll b/test/Transforms/Inline/inline-funclets.ll
new file mode 100644
index 0000000..362e03d
--- /dev/null
+++ b/test/Transforms/Inline/inline-funclets.ll
@@ -0,0 +1,455 @@
+; RUN: opt -inline -S %s | FileCheck %s

+declare void @g()


+;;; Test with a call in a funclet that needs to remain a call
+;;; when inlined because the funclet doesn't unwind to caller.
+;;; CHECK-LABEL: define void @test1(
+define void @test1() personality void ()* @g {
+entry:
+; CHECK-NEXT: entry:
+  invoke void @test1_inlinee()
+    to label %exit unwind label %cleanup
+cleanup:
+  %pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad) ]
+  cleanupret from %pad unwind to caller
+exit:
+  ret void
+}

+define void @test1_inlinee() alwaysinline personality void ()* @g {
+entry:
+  invoke void @g()
+    to label %exit unwind label %cleanup.inner
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT:   unwind label %[[cleanup_inner:.+]]

+cleanup.inner:
+  %pad.inner = cleanuppad within none []
+  call void @g() [ "funclet"(token %pad.inner) ]
+  cleanupret from %pad.inner unwind label %cleanup.outer
+; CHECK: [[cleanup_inner]]:
+; The call here needs to remain a call because pad.inner has a cleanupret
+; that stays within the inlinee.
+; CHECK-NEXT: %[[pad_inner:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: call void @g() [ "funclet"(token %[[pad_inner]]) ] +; CHECK-NEXT: cleanupret from %[[pad_inner]] unwind label %[[cleanup_outer:.+]] + +cleanup.outer: + %pad.outer = cleanuppad within none [] + call void @g() [ "funclet"(token %pad.outer) ] + cleanupret from %pad.outer unwind to caller +; CHECK: [[cleanup_outer]]: +; The call and cleanupret here need to be redirected to caller cleanup +; CHECK-NEXT: %[[pad_outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad_outer]]) ] +; CHECK-NEXT: unwind label %cleanup +; CHECK: cleanupret from %[[pad_outer]] unwind label %cleanup{{$}} + +exit: + ret void +} + + + +;;; Test with an "unwind to caller" catchswitch in a parent funclet +;;; that needs to remain "unwind to caller" because the parent +;;; doesn't unwind to caller. +;;; CHECK-LABEL: define void @test2( +define void @test2() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test2_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test2_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup1 +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup1:.+]] + +cleanup1: + %outer = cleanuppad within none [] + invoke void @g() [ "funclet"(token %outer) ] + to label %ret1 unwind label %catchswitch +; CHECK: [[cleanup1]]: +; CHECK-NEXT: %[[outer:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[outer]]) ] +; CHECK-NEXT: unwind label %[[catchswitch:.+]] + +catchswitch: + %cs = catchswitch within %outer [label %catch] unwind to caller +; CHECK: [[catchswitch]]: +; The catchswitch here needs to remain "unwind to caller" since %outer +; has a cleanupret that remains within the inlinee. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[outer]] [label %[[catch:.+]]] unwind to caller + +catch: + %inner = catchpad within %cs [] + call void @g() [ "funclet"(token %inner) ] + catchret from %inner to label %ret1 +; CHECK: [[catch]]: +; The call here needs to remain a call since it too is within %outer +; CHECK: %[[inner:[^ ]+]] = catchpad within %[[cs]] +; CHECK-NEXT: call void @g() [ "funclet"(token %[[inner]]) ] + +ret1: + cleanupret from %outer unwind label %cleanup2 +; CHECK: cleanupret from %[[outer]] unwind label %[[cleanup2:.+]] + +cleanup2: + %later = cleanuppad within none [] + cleanupret from %later unwind to caller +; CHECK: [[cleanup2]]: +; The cleanupret here needs to get redirected to the caller cleanup +; CHECK-NEXT: %[[later:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: cleanupret from %[[later]] unwind label %cleanup{{$}} + +exit: + ret void +} + + +;;; Test with a call in a cleanup that has no definitive unwind +;;; destination, that must be rewritten to an invoke. 
+;;; CHECK-LABEL: define void @test3( +define void @test3() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test3_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test3_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup:.+]] + +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + unreachable +; CHECK: [[cleanup]]: +; The call must be rewritten to an invoke targeting the caller cleanup +; because it may well unwind to there. +; CHECK-NEXT: %[[pad:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +exit: + ret void +} + + +;;; Test with a catchswitch in a cleanup that has no definitive +;;; unwind destination, that must be rewritten to unwind to the +;;; inlined invoke's unwind dest +;;; CHECK-LABEL: define void @test4( +define void @test4() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test4_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test4_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %exit unwind label %cleanup +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: unwind label %[[cleanup:.+]] + +cleanup: + %clean = cleanuppad within none [] + invoke void @g() [ "funclet"(token %clean) ] + to label %unreachable unwind label %dispatch +; CHECK: [[cleanup]]: +; CHECK-NEXT: %[[clean:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[clean]]) ] +; CHECK-NEXT: unwind label %[[dispatch:.+]] + +dispatch: + %cs = catchswitch within %clean [label %catch] unwind to caller +; CHECK: [[dispatch]]: +; The catchswitch must be rewritten to unwind to %cleanup in the caller +; because it may well unwind to there. +; CHECK-NEXT: %[[cs:[^ ]+]] = catchswitch within %[[clean]] [label %[[catch:.+]]] unwind label %cleanup{{$}} + +catch: + catchpad within %cs [] + br label %unreachable +unreachable: + unreachable +exit: + ret void +} + + +;;; Test with multiple levels of nesting, and unwind dests +;;; that need to be inferred from ancestors, descendants, +;;; and cousins. 
+;;; CHECK-LABEL: define void @test5( +define void @test5() personality void ()* @g { +entry: +; CHECK-NEXT: entry: + invoke void @test5_inlinee() + to label %exit unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test5_inlinee() alwaysinline personality void ()* @g { +entry: + invoke void @g() + to label %cont unwind label %noinfo.root +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %[[cont:[^ ]+]] unwind label %[[noinfo_root:.+]] + +noinfo.root: + %noinfo.root.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %noinfo.root.pad) ] + invoke void @g() [ "funclet"(token %noinfo.root.pad) ] + to label %noinfo.root.cont unwind label %noinfo.left +; CHECK: [[noinfo_root]]: +; Nothing under "noinfo.root" has a definitive unwind destination, so +; we must assume all of it may actually unwind, and redirect unwinds +; to the cleanup in the caller. +; CHECK-NEXT: %[[noinfo_root_pad:[^ ]+]] = cleanuppad within none [] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: to label %[[next:[^ ]+]] unwind label %cleanup{{$}} +; CHECK: [[next]]: +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: to label %[[noinfo_root_cont:[^ ]+]] unwind label %[[noinfo_left:.+]] + +noinfo.left: + %noinfo.left.pad = cleanuppad within %noinfo.root.pad [] + invoke void @g() [ "funclet"(token %noinfo.left.pad) ] + to label %unreachable unwind label %noinfo.left.child +; CHECK: [[noinfo_left]]: +; CHECK-NEXT: %[[noinfo_left_pad:[^ ]+]] = cleanuppad within %[[noinfo_root_pad]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_pad]]) ] +; CHECK-NEXT: unwind label %[[noinfo_left_child:.+]] + +noinfo.left.child: + %noinfo.left.child.cs = catchswitch within %noinfo.left.pad [label %noinfo.left.child.catch] unwind to caller +; CHECK: [[noinfo_left_child]]: +; CHECK-NEXT: %[[noinfo_left_child_cs:[^ ]+]] = catchswitch within %[[noinfo_left_pad]] [label %[[noinfo_left_child_catch:[^ ]+]]] unwind label %cleanup{{$}} + +noinfo.left.child.catch: + %noinfo.left.child.pad = catchpad within %noinfo.left.child.cs [] + call void @g() [ "funclet"(token %noinfo.left.child.pad) ] + br label %unreachable +; CHECK: [[noinfo_left_child_catch]]: +; CHECK-NEXT: %[[noinfo_left_child_pad:[^ ]+]] = catchpad within %[[noinfo_left_child_cs]] [] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_left_child_pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +noinfo.root.cont: + invoke void @g() [ "funclet"(token %noinfo.root.pad) ] + to label %unreachable unwind label %noinfo.right +; CHECK: [[noinfo_root_cont]]: +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_root_pad]]) ] +; CHECK-NEXT: unwind label %[[noinfo_right:.+]] + +noinfo.right: + %noinfo.right.cs = catchswitch within %noinfo.root.pad [label %noinfo.right.catch] unwind to caller +; CHECK: [[noinfo_right]]: +; CHECK-NEXT: %[[noinfo_right_cs:[^ ]+]] = catchswitch within %[[noinfo_root_pad]] [label %[[noinfo_right_catch:[^ ]+]]] unwind label %cleanup{{$}} + +noinfo.right.catch: + %noinfo.right.pad = catchpad within %noinfo.right.cs [] + invoke void @g() [ "funclet"(token %noinfo.right.pad) ] + to label %unreachable unwind label %noinfo.right.child +; CHECK: [[noinfo_right_catch]]: +; CHECK-NEXT: %[[noinfo_right_pad:[^ ]+]] = catchpad within %[[noinfo_right_cs]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token 
%[[noinfo_right_pad]]) ]
+; CHECK-NEXT: unwind label %[[noinfo_right_child:.+]]
+
+noinfo.right.child:
+  %noinfo.right.child.pad = cleanuppad within %noinfo.right.pad []
+  call void @g() [ "funclet"(token %noinfo.right.child.pad) ]
+  br label %unreachable
+; CHECK: [[noinfo_right_child]]:
+; CHECK-NEXT: %[[noinfo_right_child_pad:[^ ]+]] = cleanuppad within %[[noinfo_right_pad]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[noinfo_right_child_pad]]) ]
+; CHECK-NEXT: unwind label %cleanup{{$}}
+
+cont:
+  invoke void @g()
+    to label %exit unwind label %implicit.root
+; CHECK: [[cont]]:
+; CHECK-NEXT: invoke void @g()
+; CHECK-NEXT: unwind label %[[implicit_root:.+]]
+
+implicit.root:
+  %implicit.root.pad = cleanuppad within none []
+  call void @g() [ "funclet"(token %implicit.root.pad) ]
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %implicit.root.cont unwind label %implicit.left
+; CHECK: [[implicit_root]]:
+; There's an unwind edge to %internal in implicit.right, and we need to propagate that
+; fact down to implicit.right.grandchild, up to implicit.root, and down to
+; implicit.left.child.catch, leaving all calls and "unwind to caller" catchswitches
+; alone so they don't conflict with the unwind edge in implicit.right
+; CHECK-NEXT: %[[implicit_root_pad:[^ ]+]] = cleanuppad within none
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: to label %[[implicit_root_cont:[^ ]+]] unwind label %[[implicit_left:.+]]
+
+implicit.left:
+  %implicit.left.pad = cleanuppad within %implicit.root.pad []
+  invoke void @g() [ "funclet"(token %implicit.left.pad) ]
+    to label %unreachable unwind label %implicit.left.child
+; CHECK: [[implicit_left]]:
+; CHECK-NEXT: %[[implicit_left_pad:[^ ]+]] = cleanuppad within %[[implicit_root_pad:[^ ]+]]
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_left_pad]]) ]
+; CHECK-NEXT: unwind label %[[implicit_left_child:.+]]
+
+implicit.left.child:
+  %implicit.left.child.cs = catchswitch within %implicit.left.pad [label %implicit.left.child.catch] unwind to caller
+; CHECK: [[implicit_left_child]]:
+; CHECK-NEXT: %[[implicit_left_child_cs:[^ ]+]] = catchswitch within %[[implicit_left_pad]] [label %[[implicit_left_child_catch:[^ ]+]]] unwind to caller
+
+implicit.left.child.catch:
+  %implicit.left.child.pad = catchpad within %implicit.left.child.cs []
+  call void @g() [ "funclet"(token %implicit.left.child.pad) ]
+  br label %unreachable
+; CHECK: [[implicit_left_child_catch]]:
+; CHECK-NEXT: %[[implicit_left_child_pad:[^ ]+]] = catchpad within %[[implicit_left_child_cs]]
+; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_left_child_pad]]) ]
+
+implicit.root.cont:
+  invoke void @g() [ "funclet"(token %implicit.root.pad) ]
+    to label %unreachable unwind label %implicit.right
+; CHECK: [[implicit_root_cont]]:
+; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_root_pad]]) ]
+; CHECK-NEXT: unwind label %[[implicit_right:.+]]
+
+implicit.right:
+  %implicit.right.cs = catchswitch within %implicit.root.pad [label %implicit.right.catch] unwind label %internal
+; CHECK: [[implicit_right]]:
+; This is the unwind edge (to %internal) whose existence needs to get propagated around the "implicit" tree
+; CHECK-NEXT: %[[implicit_right_cs:[^ ]+]] = catchswitch within %[[implicit_root_pad]] [label %[[implicit_right_catch:[^ ]+]]] unwind label %[[internal:.+]]
+
+implicit.right.catch:
+  %implicit.right.pad = catchpad
within %implicit.right.cs [] + invoke void @g() [ "funclet"(token %implicit.right.pad) ] + to label %unreachable unwind label %implicit.right.child +; CHECK: [[implicit_right_catch]]: +; CHECK-NEXT: %[[implicit_right_pad:[^ ]+]] = catchpad within %[[implicit_right_cs]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_pad]]) ] +; CHECK-NEXT: unwind label %[[implicit_right_child:.+]] + +implicit.right.child: + %implicit.right.child.pad = cleanuppad within %implicit.right.pad [] + invoke void @g() [ "funclet"(token %implicit.right.child.pad) ] + to label %unreachable unwind label %implicit.right.grandchild +; CHECK: [[implicit_right_child]]: +; CHECK-NEXT: %[[implicit_right_child_pad:[^ ]+]] = cleanuppad within %[[implicit_right_pad]] +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[implicit_right_child_pad]]) ] +; CHECK-NEXT: unwind label %[[implicit_right_grandchild:.+]] + +implicit.right.grandchild: + %implicit.right.grandchild.cs = catchswitch within %implicit.right.child.pad [label %implicit.right.grandchild.catch] unwind to caller +; CHECK: [[implicit_right_grandchild]]: +; CHECK-NEXT: %[[implicit_right_grandchild_cs:[^ ]+]] = catchswitch within %[[implicit_right_child_pad]] [label %[[implicit_right_grandchild_catch:[^ ]+]]] unwind to caller + +implicit.right.grandchild.catch: + %implicit.right.grandhcild.pad = catchpad within %implicit.right.grandchild.cs [] + call void @g() [ "funclet"(token %implicit.right.grandhcild.pad) ] + br label %unreachable +; CHECK: [[implicit_right_grandchild_catch]]: +; CHECK-NEXT: %[[implicit_right_grandhcild_pad:[^ ]+]] = catchpad within %[[implicit_right_grandchild_cs]] +; CHECK-NEXT: call void @g() [ "funclet"(token %[[implicit_right_grandhcild_pad]]) ] + +internal: + %internal.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %internal.pad) ] + cleanupret from %internal.pad unwind to caller +; CHECK: [[internal]]: +; internal is a cleanup with a "return to caller" cleanuppad; that needs to get redirected +; to %cleanup in the caller, and the call needs to get similarly rewritten to an invoke. +; CHECK-NEXT: %[[internal_pad:[^ ]+]] = cleanuppad within none +; CHECK-NEXT: invoke void @g() [ "funclet"(token %internal.pad.i) ] +; CHECK-NEXT: to label %[[next:[^ ]+]] unwind label %cleanup{{$}} +; CHECK: [[next]]: +; CHECK-NEXT: cleanupret from %[[internal_pad]] unwind label %cleanup{{$}} + +unreachable: + unreachable +exit: + ret void +} + + +declare void @ProcessCLRException() + +; Make sure the logic doesn't get tripped up when the inlined invoke is +; itself within a funclet in the caller. 
+; CHECK-LABEL: define void @test6( +define void @test6() personality void ()* @ProcessCLRException { +entry: + invoke void @g() + to label %exit unwind label %callsite_parent +callsite_parent: + %callsite_parent.pad = cleanuppad within none [] +; CHECK: %callsite_parent.pad = cleanuppad within none + invoke void @test6_inlinee() [ "funclet"(token %callsite_parent.pad) ] + to label %ret unwind label %cleanup +ret: + cleanupret from %callsite_parent.pad unwind label %cleanup +cleanup: + %pad = cleanuppad within none [] + call void @g() [ "funclet"(token %pad) ] + cleanupret from %pad unwind to caller +exit: + ret void +} + +define void @test6_inlinee() alwaysinline personality void ()* @ProcessCLRException { +entry: + invoke void @g() + to label %exit unwind label %inlinee_cleanup +; CHECK-NEXT: invoke void @g() [ "funclet"(token %callsite_parent.pad) ] +; CHECK-NEXT: unwind label %[[inlinee_cleanup:.+]] + +inlinee_cleanup: + %inlinee.pad = cleanuppad within none [] + call void @g() [ "funclet"(token %inlinee.pad) ] + unreachable +; CHECK: [[inlinee_cleanup]]: +; CHECK-NEXT: %[[inlinee_pad:[^ ]+]] = cleanuppad within %callsite_parent.pad +; CHECK-NEXT: invoke void @g() [ "funclet"(token %[[inlinee_pad]]) ] +; CHECK-NEXT: unwind label %cleanup{{$}} + +exit: + ret void +} diff --git a/test/Transforms/InstCombine/bitreverse-hang.ll b/test/Transforms/InstCombine/bitreverse-hang.ll new file mode 100644 index 0000000..6823bd0 --- /dev/null +++ b/test/Transforms/InstCombine/bitreverse-hang.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-unroll -instcombine -S | FileCheck %s + +; This test is a worst-case scenario for bitreversal/byteswap detection. +; After loop unrolling (the unrolled loop is unreadably large so it has been kept +; rolled here), we have a binary tree of OR operands (as bitreversal detection +; looks straight through shifts): +; +; OR +; | \ +; | LSHR +; | / +; OR +; | \ +; | LSHR +; | / +; OR +; +; This results in exponential runtime. The loop here is 32 iterations which will +; totally hang if we don't deal with this case cleverly. 
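+;
+; (Editor's sketch, with hypothetical value names rather than the actual
+; unrolled output: two unrolled iterations produce
+;   %s1 = lshr i32 %v, 1
+;   %o1 = or i32 %s1, %v
+;   %s2 = lshr i32 %o1, 1
+;   %o2 = or i32 %s2, %o1
+; so a matcher that looks through each OR operand revisits the whole tree
+; under %o1 when it examines %o2, doubling the work per iteration.)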
+ +@b = common global i32 0, align 4 + +; CHECK: define i32 @fn1 +define i32 @fn1() #0 { +entry: + %b.promoted = load i32, i32* @b, align 4, !tbaa !2 + br label %for.body + +for.body: ; preds = %for.body, %entry + %or4 = phi i32 [ %b.promoted, %entry ], [ %or, %for.body ] + %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %shr = lshr i32 %or4, 1 + %or = or i32 %shr, %or4 + %inc = add nuw nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, 32 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + store i32 %or, i32* @b, align 4, !tbaa !2 + ret i32 undef +} + +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 3.8.0 (http://llvm.org/git/clang.git eb70f4e9cc9a4dc3dd57b032fb858d56b4b64a0e)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/InstCombine/bitreverse-recognize.ll b/test/Transforms/InstCombine/bitreverse-recognize.ll deleted file mode 100644 index fbd5cb6..0000000 --- a/test/Transforms/InstCombine/bitreverse-recognize.ll +++ /dev/null @@ -1,114 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define zeroext i8 @f_u8(i8 zeroext %a) { -; CHECK-LABEL: @f_u8 -; CHECK-NEXT: %[[A:.*]] = call i8 @llvm.bitreverse.i8(i8 %a) -; CHECK-NEXT: ret i8 %[[A]] - %1 = shl i8 %a, 7 - %2 = shl i8 %a, 5 - %3 = and i8 %2, 64 - %4 = shl i8 %a, 3 - %5 = and i8 %4, 32 - %6 = shl i8 %a, 1 - %7 = and i8 %6, 16 - %8 = lshr i8 %a, 1 - %9 = and i8 %8, 8 - %10 = lshr i8 %a, 3 - %11 = and i8 %10, 4 - %12 = lshr i8 %a, 5 - %13 = and i8 %12, 2 - %14 = lshr i8 %a, 7 - %15 = or i8 %14, %1 - %16 = or i8 %15, %3 - %17 = or i8 %16, %5 - %18 = or i8 %17, %7 - %19 = or i8 %18, %9 - %20 = or i8 %19, %11 - %21 = or i8 %20, %13 - ret i8 %21 -} - -; The ANDs with 32 and 64 have been swapped here, so the sequence does not -; completely match a bitreverse. 
-define zeroext i8 @f_u8_fail(i8 zeroext %a) { -; CHECK-LABEL: @f_u8_fail -; CHECK-NOT: call -; CHECK: ret i8 - %1 = shl i8 %a, 7 - %2 = shl i8 %a, 5 - %3 = and i8 %2, 32 - %4 = shl i8 %a, 3 - %5 = and i8 %4, 64 - %6 = shl i8 %a, 1 - %7 = and i8 %6, 16 - %8 = lshr i8 %a, 1 - %9 = and i8 %8, 8 - %10 = lshr i8 %a, 3 - %11 = and i8 %10, 4 - %12 = lshr i8 %a, 5 - %13 = and i8 %12, 2 - %14 = lshr i8 %a, 7 - %15 = or i8 %14, %1 - %16 = or i8 %15, %3 - %17 = or i8 %16, %5 - %18 = or i8 %17, %7 - %19 = or i8 %18, %9 - %20 = or i8 %19, %11 - %21 = or i8 %20, %13 - ret i8 %21 -} - -define zeroext i16 @f_u16(i16 zeroext %a) { -; CHECK-LABEL: @f_u16 -; CHECK-NEXT: %[[A:.*]] = call i16 @llvm.bitreverse.i16(i16 %a) -; CHECK-NEXT: ret i16 %[[A]] - %1 = shl i16 %a, 15 - %2 = shl i16 %a, 13 - %3 = and i16 %2, 16384 - %4 = shl i16 %a, 11 - %5 = and i16 %4, 8192 - %6 = shl i16 %a, 9 - %7 = and i16 %6, 4096 - %8 = shl i16 %a, 7 - %9 = and i16 %8, 2048 - %10 = shl i16 %a, 5 - %11 = and i16 %10, 1024 - %12 = shl i16 %a, 3 - %13 = and i16 %12, 512 - %14 = shl i16 %a, 1 - %15 = and i16 %14, 256 - %16 = lshr i16 %a, 1 - %17 = and i16 %16, 128 - %18 = lshr i16 %a, 3 - %19 = and i16 %18, 64 - %20 = lshr i16 %a, 5 - %21 = and i16 %20, 32 - %22 = lshr i16 %a, 7 - %23 = and i16 %22, 16 - %24 = lshr i16 %a, 9 - %25 = and i16 %24, 8 - %26 = lshr i16 %a, 11 - %27 = and i16 %26, 4 - %28 = lshr i16 %a, 13 - %29 = and i16 %28, 2 - %30 = lshr i16 %a, 15 - %31 = or i16 %30, %1 - %32 = or i16 %31, %3 - %33 = or i16 %32, %5 - %34 = or i16 %33, %7 - %35 = or i16 %34, %9 - %36 = or i16 %35, %11 - %37 = or i16 %36, %13 - %38 = or i16 %37, %15 - %39 = or i16 %38, %17 - %40 = or i16 %39, %19 - %41 = or i16 %40, %21 - %42 = or i16 %41, %23 - %43 = or i16 %42, %25 - %44 = or i16 %43, %27 - %45 = or i16 %44, %29 - ret i16 %45 -} \ No newline at end of file diff --git a/test/Transforms/InstCombine/cos-2.ll b/test/Transforms/InstCombine/cos-2.ll index c9a9c7c..a85cc8f 100644 --- a/test/Transforms/InstCombine/cos-2.ll +++ b/test/Transforms/InstCombine/cos-2.ll @@ -1,12 +1,11 @@ -; Test that the cos library call simplifier works correctly. -; ; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" declare float @cos(double) +declare signext i8 @sqrt(...) -; Check that cos functions with the wrong prototype aren't simplified. +; Check that functions with the wrong prototype aren't simplified. define float @test_no_simplify1(double %d) { ; CHECK-LABEL: @test_no_simplify1( @@ -15,3 +14,14 @@ define float @test_no_simplify1(double %d) { ; CHECK: call float @cos(double %neg) ret float %cos } + + +define i8 @bogus_sqrt() { + %fake_sqrt = call signext i8 (...) @sqrt() + ret i8 %fake_sqrt + +; CHECK-LABEL: bogus_sqrt( +; CHECK-NEXT: %fake_sqrt = call signext i8 (...) @sqrt() +; CHECK-NEXT: ret i8 %fake_sqrt +} + diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll index 319ea32..74f3ebb 100644 --- a/test/Transforms/InstCombine/double-float-shrink-1.ll +++ b/test/Transforms/InstCombine/double-float-shrink-1.ll @@ -364,6 +364,26 @@ define float @max1(float %a, float %b) { ; CHECK-NEXT: ret } +; A function can have a name that matches a common libcall, +; but with the wrong type(s). Let it be. 
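+; (Editor's note: the @fake_fmin below calls an @fmin declared on fp128,
+; while the libm fmin operates on double, so the simplifier must not
+; treat it as the real libcall.)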
+ +define float @fake_fmin(float %a, float %b) { + %c = fpext float %a to fp128 + %d = fpext float %b to fp128 + %e = call fp128 @fmin(fp128 %c, fp128 %d) + %f = fptrunc fp128 %e to float + ret float %f + +; CHECK-LABEL: fake_fmin( +; CHECK-NEXT: %c = fpext float %a to fp128 +; CHECK-NEXT: %d = fpext float %b to fp128 +; CHECK-NEXT: %e = call fp128 @fmin(fp128 %c, fp128 %d) +; CHECK-NEXT: %f = fptrunc fp128 %e to float +; CHECK-NEXT: ret float %f +} + +declare fp128 @fmin(fp128, fp128) ; This is not the 'fmin' you're looking for. + declare double @fmax(double, double) declare double @tanh(double) #1 diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp index 67e7cbd..a76ec11 100644 --- a/tools/lli/lli.cpp +++ b/tools/lli/lli.cpp @@ -16,6 +16,7 @@ #include "OrcLazyJIT.h" #include "RemoteJITUtils.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/CodeGen/LinkAllCodegenComponents.h" @@ -741,11 +742,11 @@ std::unique_ptr launchRemote() { ChildPath.reset(new char[ChildExecPath.size() + 1]); std::copy(ChildExecPath.begin(), ChildExecPath.end(), &ChildPath[0]); ChildPath[ChildExecPath.size()] = '\0'; - std::string ChildInStr = std::to_string(PipeFD[0][0]); + std::string ChildInStr = utostr(PipeFD[0][0]); ChildIn.reset(new char[ChildInStr.size() + 1]); std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]); ChildIn[ChildInStr.size()] = '\0'; - std::string ChildOutStr = std::to_string(PipeFD[1][1]); + std::string ChildOutStr = utostr(PipeFD[1][1]); ChildOut.reset(new char[ChildOutStr.size() + 1]); std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]); ChildOut[ChildOutStr.size()] = '\0'; diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index fb50160..c3884ba 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -65,11 +65,6 @@ function usage() { echo " -no-openmp Disable check-out & build libomp" } -if [ `uname -s` = "Darwin" ]; then - # compiler-rt doesn't yet build with CMake on Darwin. - use_autoconf="yes" -fi - while [ $# -gt 0 ]; do case $1 in -release | --release ) @@ -288,10 +283,20 @@ function export_sources() { if [ ! -h clang ]; then ln -s ../../cfe.src clang fi - cd $BuildDir/llvm.src/tools/clang/tools - if [ ! -h extra ]; then - ln -s ../../../../clang-tools-extra.src extra + + # The autoconf and CMake builds want different symlinks here: + if [ "$use_autoconf" = "yes" ]; then + cd $BuildDir/llvm.src/tools/clang/tools + if [ ! -h extra ]; then + ln -s ../../../../clang-tools-extra.src extra + fi + else + cd $BuildDir/cfe.src/tools + if [ ! -h extra ]; then + ln -s ../../clang-tools-extra.src extra + fi fi + cd $BuildDir/llvm.src/projects if [ -d $BuildDir/test-suite.src ] && [ ! 
-h test-suite ]; then
        ln -s ../../test-suite.src test-suite
--
cgit v1.1


From 44c4732640f764c943d7814138396141c0f4646b Mon Sep 17 00:00:00 2001
From: dim
Date: Wed, 27 Jan 2016 21:08:51 +0000
Subject: Vendor import of llvm release_38 branch r258968:

  https://llvm.org/svn/llvm-project/llvm/branches/release_38@258968
---
 include/llvm/Analysis/ScalarEvolution.h          |  6 +-
 lib/Analysis/DemandedBits.cpp                    |  2 +-
 lib/Analysis/ScalarEvolution.cpp                 |  8 +++
 lib/Target/AMDGPU/AMDGPU.td                      |  5 ++
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp            |  2 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h              |  5 ++
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp        |  2 +
 lib/Target/X86/X86ISelLowering.cpp               |  3 +-
 lib/Transforms/Instrumentation/GCOVProfiling.cpp | 10 ++++
 test/Analysis/DemandedBits/basic.ll              | 13 ++++-
 test/CodeGen/X86/cmovcmov.ll                     | 49 ++++++++++++++++
 test/Transforms/GCOVProfiling/modules.ll         | 12 ++++
 test/Transforms/IndVarSimplify/pr26207.ll        | 20 +++++++
 utils/release/test-release.sh                    | 71 +++++++++---------------
 14 files changed, 157 insertions(+), 51 deletions(-)
 create mode 100644 test/Transforms/GCOVProfiling/modules.ll
 create mode 100644 test/Transforms/IndVarSimplify/pr26207.ll

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index c08335d..ef93057 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -412,7 +412,11 @@ namespace llvm {
       /*implicit*/ ExitLimit(const SCEV *E) : Exact(E), Max(E) {}
 
-      ExitLimit(const SCEV *E, const SCEV *M) : Exact(E), Max(M) {}
+      ExitLimit(const SCEV *E, const SCEV *M) : Exact(E), Max(M) {
+        assert((isa<SCEVCouldNotCompute>(Exact) ||
+                !isa<SCEVCouldNotCompute>(Max)) &&
+               "Exact is not allowed to be less precise than Max");
+      }
 
       /// Test whether this ExitLimit contains any computed information, or
       /// whether it's all SCEVCouldNotCompute values.
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 912c5ce..143d0b7 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -244,7 +244,7 @@ void DemandedBits::determineLiveOperandBits(
       break;
     case Instruction::ICmp:
       // Count the number of leading zeroes in each operand.
-      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1));
       auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(),
                                        KnownZero2.countLeadingOnes());
       AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes);
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 34074ef..ef1bb3a 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -5368,6 +5368,14 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
           BECount = EL0.Exact;
       }
 
+      // There are cases (e.g. PR26207) where computeExitLimitFromCond is able
+      // to be more aggressive when computing BECount than when computing
+      // MaxBECount.  In these cases it is possible for EL0.Exact and EL1.Exact
+      // to match, but for EL0.Max and EL1.Max to not.
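+      // (Editor's note, restating the guard that follows: when the combined
+      // exact count is computable but the combined max is not, it is sound
+      // and strictly more precise to reuse the exact count as the max.)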
+      if (isa<SCEVCouldNotCompute>(MaxBECount) &&
+          !isa<SCEVCouldNotCompute>(BECount))
+        MaxBECount = BECount;
+
       return ExitLimit(BECount, MaxBECount);
     }
     if (BO->getOpcode() == Instruction::Or) {
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index db869cf..79c6604 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -138,6 +138,11 @@ def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
                                                       "true",
                                                       "Enable scratch buffer sizes greater than 128 GB">;
 
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+        "EnableSIScheduler",
+        "true",
+        "Enable SI Machine Scheduler">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
                           "TexVTXClauseSize",
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c6af5b9..7d70fa7 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -78,7 +78,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
       IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
-      FrameLowering(nullptr),
+      EnableSIScheduler(false), FrameLowering(nullptr),
       InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
 
   initializeSubtargetDependencies(TT, GPU, FS);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d371227..4796e9e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -90,6 +90,7 @@ private:
   int LDSBankCount;
   unsigned IsaVersion;
   bool EnableHugeScratchBuffer;
+  bool EnableSIScheduler;
 
   std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -280,6 +281,10 @@ public:
     return EnableHugeScratchBuffer;
   }
 
+  bool enableSIScheduler() const {
+    return EnableSIScheduler;
+  }
+
   bool dumpCode() const {
     return DumpCode;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b1be619..519ae5c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -147,6 +147,8 @@ public:
     const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       return createR600MachineScheduler(C);
+    else if (ST.enableSIScheduler())
+      return createSIMachineScheduler(C);
     return nullptr;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6904714..34f3919 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21880,7 +21880,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
   if (LastCMOV == MI && NextMIIt != BB->end() &&
       NextMIIt->getOpcode() == MI->getOpcode() &&
       NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+      NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg() &&
+      NextMIIt->getOperand(1).isKill()) {
     CascadedCMOV = &*NextMIIt;
   }
 
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index fa939ae..ffde7f8 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -494,6 +494,11 @@ void GCOVProfiler::emitProfileNotes() {
       // LTO, we'll generate the same .gcno files.
 
       auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+      // Skip module skeleton (and module) CUs.
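+      // (Editor's note: a non-zero DWOId marks a skeleton unit for split
+      // DWARF, which carries no code of its own to instrument.)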
+      if (CU->getDWOId())
+        continue;
+
       std::error_code EC;
       raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None);
       std::string EdgeDestinations;
@@ -853,6 +858,11 @@ Function *GCOVProfiler::insertCounterWriteout(
   if (CU_Nodes) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
       auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));
+
+      // Skip module skeleton (and module) CUs.
+      if (CU->getDWOId())
+        continue;
+
       std::string FilenameGcda = mangleName(CU, "gcda");
       uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
       Builder.CreateCall(StartFile,
diff --git a/test/Analysis/DemandedBits/basic.ll b/test/Analysis/DemandedBits/basic.ll
index 487e522..9973edf 100644
--- a/test/Analysis/DemandedBits/basic.ll
+++ b/test/Analysis/DemandedBits/basic.ll
@@ -24,11 +24,20 @@ define i1 @test_icmp1(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: 'test_icmp2'
 ; CHECK-DAG: DemandedBits: 0x1 for   %3 = icmp eq i32 %1, %2
-; CHECK-DAG: DemandedBits: 0xFF for   %1 = and i32 %a, 255
-; CHECK-DAG: DemandedBits: 0xF for   %2 = ashr i32 %1, 4
+; CHECK-DAG: DemandedBits: 0xFFF for   %1 = and i32 %a, 255
+; CHECK-DAG: DemandedBits: 0xFF for   %2 = ashr i32 %1, 4
 define i1 @test_icmp2(i32 %a, i32 %b) {
   %1 = and i32 %a, 255
   %2 = ashr i32 %1, 4
   %3 = icmp eq i32 %1, %2
   ret i1 %3
 }
+
+; CHECK-LABEL: 'test_icmp3'
+; CHECK-DAG: DemandedBits: 0xFFFFFFFF for   %1 = and i32 %a, 255
+; CHECK-DAG: DemandedBits: 0x1 for   %2 = icmp eq i32 -1, %1
+define i1 @test_icmp3(i32 %a) {
+  %1 = and i32 %a, 255
+  %2 = icmp eq i32 -1, %1
+  ret i1 %2
+}
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index d3d9748..9363d31 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -224,3 +224,52 @@ entry:
 }
 
 attributes #0 = { nounwind }
+
+@g8 = global i8 0
+
+; The following test failed because llvm had a bug where a structure like:
+;
+; %vreg12 = CMOV_GR8 %vreg7, %vreg11 ... (lt)
+; %vreg13 = CMOV_GR8 %vreg12, %vreg11 ...
(gt) +; +; was lowered to: +; +; The first two cmovs got expanded to: +; BB#0: +; JL_1 BB#9 +; BB#7: +; JG_1 BB#9 +; BB#8: +; BB#9: +; vreg12 = phi(vreg7, BB#8, vreg11, BB#0, vreg12, BB#7) +; vreg13 = COPY vreg12 +; Which was invalid as %vreg12 is not the same value as %vreg13 + +; CHECK-LABEL: no_cascade_opt: +; CMOV-DAG: cmpl %edx, %esi +; CMOV-DAG: movb $20, %al +; CMOV-DAG: movb $20, %dl +; CMOV: jl [[BB0:.LBB[0-9_]+]] +; CMOV: movb %cl, %dl +; CMOV: [[BB0]]: +; CMOV: jg [[BB1:.LBB[0-9_]+]] +; CMOV: movb %dl, %al +; CMOV: [[BB1]]: +; CMOV: testl %edi, %edi +; CMOV: je [[BB2:.LBB[0-9_]+]] +; CMOV: movb %dl, %al +; CMOV: [[BB2]]: +; CMOV: movb %al, g8(%rip) +; CMOV: retq +define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) { +entry: + %c0 = icmp eq i32 %v0, 0 + %c1 = icmp slt i32 %v1, %v2 + %c2 = icmp sgt i32 %v1, %v2 + %trunc = trunc i32 %v3 to i8 + %sel0 = select i1 %c1, i8 20, i8 %trunc + %sel1 = select i1 %c2, i8 20, i8 %sel0 + %sel2 = select i1 %c0, i8 %sel1, i8 %sel0 + store volatile i8 %sel2, i8* @g8 + ret void +} diff --git a/test/Transforms/GCOVProfiling/modules.ll b/test/Transforms/GCOVProfiling/modules.ll new file mode 100644 index 0000000..1a8edfe --- /dev/null +++ b/test/Transforms/GCOVProfiling/modules.ll @@ -0,0 +1,12 @@ +; RUN: opt -insert-gcov-profiling -o - < %s | llvm-dis | FileCheck -check-prefix=EMIT-ARCS %s + +; EMIT-ARCS-NOT: call void @llvm_gcda_start_file + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !2, globals: !2, imports: !2, dwoId: 43981) +!1 = !DIFile(filename: "", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/Transforms/IndVarSimplify/pr26207.ll b/test/Transforms/IndVarSimplify/pr26207.ll new file mode 100644 index 0000000..9d351e0 --- /dev/null +++ b/test/Transforms/IndVarSimplify/pr26207.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define void @main(i16 %in) { +; CHECK-LABEL: @main( + br label %bb2 + +bb2: ; preds = %bb1.i, %bb2, %0 + %_tmp44.i = icmp slt i16 %in, 2 + br i1 %_tmp44.i, label %bb1.i, label %bb2 + +bb1.i: ; preds = %bb1.i, %bb2 + %_tmp25.i = phi i16 [ %in, %bb2 ], [ %_tmp6.i, %bb1.i ] + %_tmp6.i = add nsw i16 %_tmp25.i, 1 + %_tmp10.i = icmp sge i16 %_tmp6.i, 2 + %exitcond.i = icmp eq i16 %_tmp6.i, 2 + %or.cond = and i1 %_tmp10.i, %exitcond.i + br i1 %or.cond, label %bb2, label %bb1.i +} diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index c3884ba..c5fe631 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -267,56 +267,36 @@ function export_sources() { check_valid_urls for proj in $projects ; do - if [ -d $proj.src ]; then - echo "# Reusing $proj $Release-$RC sources" + case $proj in + llvm) + projsrc=$proj.src + ;; + cfe) + projsrc=llvm.src/tools/clang + ;; + clang-tools-extra) + projsrc=llvm.src/tools/clang/tools/extra + ;; + compiler-rt|libcxx|libcxxabi|libunwind|openmp|test-suite) + projsrc=llvm.src/projects/$proj + ;; + *) + echo "error: unknown project $proj" + exit 1 + ;; + esac + + if [ -d $projsrc ]; then + echo "# Reusing $proj $Release-$RC sources in $projsrc" continue fi - echo "# Exporting $proj $Release-$RC sources" - if ! 
svn export -q $Base_url/$proj/$ExportBranch $proj.src ; then + echo "# Exporting $proj $Release-$RC sources to $projsrc" + if ! svn export -q $Base_url/$proj/$ExportBranch $projsrc ; then echo "error: failed to export $proj project" exit 1 fi done - echo "# Creating symlinks" - cd $BuildDir/llvm.src/tools - if [ ! -h clang ]; then - ln -s ../../cfe.src clang - fi - - # The autoconf and CMake builds want different symlinks here: - if [ "$use_autoconf" = "yes" ]; then - cd $BuildDir/llvm.src/tools/clang/tools - if [ ! -h extra ]; then - ln -s ../../../../clang-tools-extra.src extra - fi - else - cd $BuildDir/cfe.src/tools - if [ ! -h extra ]; then - ln -s ../../clang-tools-extra.src extra - fi - fi - - cd $BuildDir/llvm.src/projects - if [ -d $BuildDir/test-suite.src ] && [ ! -h test-suite ]; then - ln -s ../../test-suite.src test-suite - fi - if [ -d $BuildDir/compiler-rt.src ] && [ ! -h compiler-rt ]; then - ln -s ../../compiler-rt.src compiler-rt - fi - if [ -d $BuildDir/openmp.src ] && [ ! -h openmp ]; then - ln -s ../../openmp.src openmp - fi - if [ -d $BuildDir/libcxx.src ] && [ ! -h libcxx ]; then - ln -s ../../libcxx.src libcxx - fi - if [ -d $BuildDir/libcxxabi.src ] && [ ! -h libcxxabi ]; then - ln -s ../../libcxxabi.src libcxxabi - fi - if [ -d $BuildDir/libunwind.src ] && [ ! -h libunwind ]; then - ln -s ../../libunwind.src libunwind - fi - cd $BuildDir } @@ -560,8 +540,9 @@ for Flavor in $Flavors ; do # Substitute 'Phase2' for 'Phase3' in the Phase 2 object file in # case there are build paths in the debug info. On some systems, # sed adds a newline to the output, so pass $p3 through sed too. - if ! cmp -s <(sed -e 's,Phase2,Phase3,g' $p2) <(sed -e '' $p3) \ - 16 16 ; then + if ! cmp -s \ + <(env LC_CTYPE=C sed -e 's,Phase2,Phase3,g' $p2) \ + <(env LC_CTYPE=C sed -e '' $p3) 16 16; then echo "file `basename $p2` differs between phase 2 and phase 3" fi done -- cgit v1.1 From 97a7b8a20a989eb4cf3d9465e1451de6cd05fa41 Mon Sep 17 00:00:00 2001 From: dim Date: Sat, 13 Feb 2016 14:57:10 +0000 Subject: Vendor import of llvm release_38 branch r260756: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260756 --- cmake/modules/AddLLVM.cmake | 36 +-- cmake/modules/LLVM-Config.cmake | 11 +- docs/ReleaseNotes.rst | 148 ++++++++- include/llvm/IR/IntrinsicsPowerPC.td | 2 +- include/llvm/IR/Value.h | 4 - lib/Analysis/DemandedBits.cpp | 7 - lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 5 + lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 55 +++- lib/IR/Value.cpp | 4 +- lib/Target/AArch64/AArch64.td | 4 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 3 + lib/Target/AArch64/AArch64SchedM1.td | 359 +++++++++++++++++++++ lib/Target/AMDGPU/AMDGPU.td | 3 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- lib/Target/AMDGPU/Processors.td | 12 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 3 + lib/Target/ARM/ARMISelDAGToDAG.cpp | 4 +- lib/Target/PowerPC/PPCFastISel.cpp | 18 +- lib/Target/PowerPC/PPCInstrAltivec.td | 2 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 65 ++-- lib/Transforms/InstCombine/InstCombineCompares.cpp | 2 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 3 +- .../InstCombine/InstCombineVectorOps.cpp | 18 +- lib/Transforms/Utils/SimplifyCFG.cpp | 12 + test/Analysis/DemandedBits/basic.ll | 31 -- test/CodeGen/AArch64/fp16-v4-instructions.ll | 274 ++++++++++++++++ test/CodeGen/AArch64/fp16-v8-instructions.ll | 84 +++++ test/CodeGen/AMDGPU/hsa-note-no-func.ll | 2 + 
test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 1 + test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 33 ++ test/CodeGen/ARM/shifter_operand.ll | 17 + test/CodeGen/PowerPC/fast-isel-ret.ll | 9 + test/CodeGen/PowerPC/inline-asm-s-modifier.ll | 10 + test/CodeGen/PowerPC/pr26193.ll | 9 + test/CodeGen/PowerPC/pr26356.ll | 136 ++++++++ test/CodeGen/PowerPC/pr26381.ll | 8 + test/CodeGen/SystemZ/int-cmp-53.ll | 26 ++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 63 +++- test/CodeGen/X86/setcc-lowering.ll | 79 ++++- test/DebugInfo/X86/PR26148.ll | 102 ++++++ test/Transforms/InstCombine/icmp.ll | 12 + .../InstCombine/insert-extract-shuffle.ll | 30 ++ test/Transforms/InstCombine/unpack-fca.ll | 15 + .../AArch64/loop-vectorization-factors.ll | 34 -- .../SimplifyCFG/X86/switch_to_lookup_table.ll | 32 ++ tools/CMakeLists.txt | 2 +- utils/release/test-release.sh | 26 +- utils/unittest/CMakeLists.txt | 7 +- utils/unittest/UnitTestMain/CMakeLists.txt | 4 +- 51 files changed, 1611 insertions(+), 229 deletions(-) create mode 100644 lib/Target/AArch64/AArch64SchedM1.td create mode 100644 test/CodeGen/AMDGPU/spill-scavenge-offset.ll create mode 100644 test/CodeGen/PowerPC/inline-asm-s-modifier.ll create mode 100644 test/CodeGen/PowerPC/pr26193.ll create mode 100644 test/CodeGen/PowerPC/pr26356.ll create mode 100644 test/CodeGen/PowerPC/pr26381.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-53.ll create mode 100644 test/DebugInfo/X86/PR26148.ll diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index b06e434..a829751 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -468,20 +468,23 @@ function(llvm_add_library name) endif() endif() - # Add the explicit dependency information for this library. - # - # It would be nice to verify that we have the dependencies for this library - # name, but using get_property(... SET) doesn't suffice to determine if a - # property has been set to an empty value. - get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) - - if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_STATIC AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) + if (DEFINED LLVM_LINK_COMPONENTS OR DEFINED ARG_LINK_COMPONENTS) + if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) + set(llvm_libs LLVM) + else() + llvm_map_components_to_libnames(llvm_libs + ${ARG_LINK_COMPONENTS} + ${LLVM_LINK_COMPONENTS} + ) + endif() else() - llvm_map_components_to_libnames(llvm_libs - ${ARG_LINK_COMPONENTS} - ${LLVM_LINK_COMPONENTS} - ) + # Components have not been defined explicitly in CMake, so add the + # dependency information for this library as defined by LLVMBuild. + # + # It would be nice to verify that we have the dependencies for this library + # name, but using get_property(... SET) doesn't suffice to determine if a + # property has been set to an empty value. + get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) endif() if(CMAKE_VERSION VERSION_LESS 2.8.12) @@ -882,14 +885,11 @@ function(add_unittest test_suite test_name) set(LLVM_REQUIRES_RTTI OFF) + list(APPEND LLVM_LINK_COMPONENTS Support) # gtest needs it for raw_ostream add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN}) set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}) set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir}) - target_link_libraries(${test_name} - gtest - gtest_main - LLVMSupport # gtest needs it for raw_ostream. 
-    )
+  target_link_libraries(${test_name} gtest_main gtest)
 
   add_dependencies(${test_suite} ${test_name})
   get_target_property(test_suite_folder ${test_suite} FOLDER)
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index aa68b40..725178a 100755
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -40,10 +40,19 @@ macro(llvm_config executable)
     # done in case libLLVM does not contain all of the components
     # the target requires.
     #
-    # TODO strip LLVM_DYLIB_COMPONENTS out of link_components.
+    # Strip LLVM_DYLIB_COMPONENTS out of link_components.
     # To do this, we need special handling for "all", since that
     # may imply linking to libraries that are not included in
     # libLLVM.
+
+    if (DEFINED link_components AND DEFINED LLVM_DYLIB_COMPONENTS)
+      if("${LLVM_DYLIB_COMPONENTS}" STREQUAL "all")
+        set(link_components "")
+      else()
+        list(REMOVE_ITEM link_components ${LLVM_DYLIB_COMPONENTS})
+      endif()
+    endif()
+
     target_link_libraries(${executable} LLVM)
   endif()
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dccb7f4..7b284d5 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -5,11 +5,6 @@ LLVM 3.8 Release Notes
 .. contents::
     :local:
 
-.. warning::
-   These are in-progress notes for the upcoming LLVM 3.8 release.  You may
-   prefer the `LLVM 3.7 Release Notes <http://llvm.org/releases/3.7.0/docs/ReleaseNotes.html>`_.
-
 Introduction
 ============
 
@@ -26,11 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
 <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
 them.
 
-Note that if you are reading this file from a Subversion checkout or the main
-LLVM web page, this document applies to the *next* release, not the current
-one.  To see the release notes for a specific release, please see the `releases
-page <http://llvm.org/releases/>`_.
-
 Non-comprehensive list of changes in this release
 =================================================
 * With this release, the minimum Windows version required for running LLVM is
@@ -79,6 +69,26 @@ Non-comprehensive list of changes in this release
 
 * Support for dematerializing has been dropped.
 
+* RegisterScheduler::setDefault was removed. Targets that used to call into the
+  command line parser to set the DAGScheduler, and that don't have enough
+  control with setSchedulingPreference, should look into overriding the
+  SubTargetHook "getDAGScheduler()".
+
+* ``ilist_iterator<T>`` no longer has implicit conversions to and from ``T*``,
+  since ``ilist_iterator<T>`` may be pointing at the sentinel (which is usually
+  not of type ``T`` at all).  To convert from an iterator ``I`` to a pointer,
+  use ``&*I``; to convert from a pointer ``P`` to an iterator, use
+  ``P->getIterator()``.  Alternatively, explicit conversions via
+  ``static_cast<T>(U)`` are still available.
+
+* ``ilist_node<T>::getNextNode()`` and ``ilist_node<T>::getPrevNode()`` now
+  fail at compile time when the node cannot access its parent list.
+  Previously, when the sentinel was an ``ilist_half_node<T>``, this API
+  could return the sentinel instead of ``nullptr``.  Frustrated callers should
+  be updated to use ``iplist<T>::getNextNode(T*)`` instead.  Alternatively, if
+  the node ``N`` is guaranteed not to be the last in the list, it is safe to
+  call ``&*++N->getIterator()`` directly.
+
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
@@ -98,17 +108,97 @@ Non-comprehensive list of changes in this release
    Makes programs 10x faster by doing Special New Thing.
 
-Changes to the ARM Backend
---------------------------
-
-During this release ...
+Changes to the ARM Backends
+---------------------------
+
+During this release the AArch64 target has:
+
+* Added support for more sanitizers (MSAN, TSAN) and made them compatible with
+  all VMA kernel configurations (currently tested on 39 and 42 bits).
+* Gained initial LLD support in the new ELF back-end
+* Extended the Load/Store optimiser and cleaned up some of the bad decisions
+  made earlier.
+* Expanded LLDB support, including watchpoints, native building, Renderscript,
+  LLDB-server, debugging 32-bit applications.
+* Added support for the ``Exynos M1`` chip.
+
+During this release the ARM target has:
+
+* Gained massive performance improvements on embedded benchmarks due to finally
+  running the stride vectorizer in full form, incrementing the performance gains
+  that we already had in the previous releases with limited stride vectorization.
+* Expanded LLDB support, including watchpoints, unwind tables
+* Extended the Load/Store optimiser and cleaned up some of the bad decisions
+  made earlier.
+* Simplified code generation for global variable addresses in ELF, resulting in
+  a significant (4% in Chromium) reduction in code size.
+* Gained some additional code size improvements, though there's still a long road
+  ahead, especially for older cores.
+* Added some EABI floating point comparison functions to Compiler-RT
+* Added support for Windows+GNU triple, +features in -mcpu/-march options.
 
 Changes to the MIPS Target
 --------------------------
 
-During this release ...
-
+During this release the MIPS target has:
+
+* Significantly extended support for the Integrated Assembler. See below for
+  more information.
+* Added support for the ``P5600`` processor.
+* Added support for the ``interrupt`` attribute for MIPS32R2 and later. This
+  attribute will generate a function which can be used as an interrupt handler
+  on bare metal MIPS targets using the static relocation model.
+* Added support for the ``ERETNC`` instruction found in MIPS32R5 and later.
+* Added support for OpenCL. See http://portablecl.org/.
+
+  * Address spaces 1 to 255 are now reserved for software use and conversions
+    between them are no-op casts.
+
+* Removed the ``mips16`` value for the -mcpu option since it is an :abbr:`ASE
+  (Application Specific Extension)` and not a processor. If you were using this,
+  please specify another CPU and use ``-mips16`` to enable MIPS16.
+* Removed ``copy_u.w`` from 32-bit MSA and ``copy_u.d`` from 64-bit MSA since
+  they have been removed from the MSA specification due to forward compatibility
+  issues.  For example, 32-bit MSA code containing ``copy_u.w`` would behave
+  differently on a 64-bit processor supporting MSA. The corresponding intrinsics
+  are still available and may expand to ``copy_s.[wd]`` where this is
+  appropriate for forward compatibility purposes.
+* Relaxed the ``-mnan`` option to allow ``-mnan=2008`` on MIPS32R2/MIPS64R2 for
+  compatibility with GCC.
+* Made MIPS64R6 the default CPU for 64-bit Android triples.
+
+The MIPS target has also fixed various bugs including the following notable
+fixes:
+
+* Fixed reversed operands on ``mthi``/``mtlo`` in the DSP :abbr:`ASE
+  (Application Specific Extension)`.
+* The code generator no longer uses ``jal`` for calls to absolute immediate
+  addresses.
+* Disabled fast instruction selection on MIPS32R6 and MIPS64R6 since this is not
+  yet supported.
+* Corrected addend for ``R_MIPS_HI16`` and ``R_MIPS_PCHI16`` in MCJIT
+* The code generator no longer crashes when handling subregisters of a 64-bit
+  FPU register with undefined value.
+* The code generator no longer attempts to use ``$zero`` for operands that do
+  not permit ``$zero``.
+* Corrected the opcode used for ``ll``/``sc`` when using MIPS32R6/MIPS64R6 and
+  the Integrated Assembler.
+* Added support for atomic load and atomic store.
+* Corrected debug info when dynamically re-aligning the stack.
+
+Integrated Assembler
+^^^^^^^^^^^^^^^^^^^^
+
+We have made a large number of improvements to the integrated assembler for
+MIPS. In this release, the integrated assembler isn't quite production-ready
+since there are a few known issues related to bare-metal support, checking
+immediates on instructions, and the N32/N64 ABIs. However, the current support
+should be sufficient for many users of the O32 ABI, particularly those targeting
+MIPS32 on Linux or bare-metal MIPS32.
+
+If you would like to try the integrated assembler, please use
+``-fintegrated-as``.
 
 Changes to the PowerPC Target
 -----------------------------
@@ -123,6 +213,20 @@ Changes to the X86 Target
 
 * TLS is enabled for Cygwin as emutls.
 
+* Smaller code for materializing 32-bit 1 and -1 constants at ``-Os``.
+
+* More efficient code for wide integer compares. (E.g. 64-bit compares
+  on 32-bit targets.)
+
+* Tail call support for ``thiscall``, ``stdcall``, ``vectorcall``, and
+  ``fastcall`` functions.
+
+Changes to the AVR Target
+-------------------------
+
+Slightly less than half of the AVR backend has been merged in at this point. It is still
+missing a number of large parts which cause it to be unusable, but is well on the
+road to being completely merged and workable.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -140,7 +244,19 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
 a lot of other language and tools projects. This section lists some of the
 projects that have already been updated to work with LLVM 3.8.
 
-* A project
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X and Windows and also PowerPC (32/64 bit)
+and ARM. Ports to other architectures like AArch64 and MIPS64 are underway.
 
 Additional Information
diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 06dfc32..5512b10 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in {  // All PPC intrinsics start with "llvm.ppc.".
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">, - Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty], + Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index bb7ff27..8918dcd 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -280,11 +280,7 @@ public: // when using them since you might not get all uses. // The methods that don't start with materialized_ assert that modules is // fully materialized. -#ifdef NDEBUG - void assertModuleIsMaterialized() const {} -#else void assertModuleIsMaterialized() const; -#endif bool use_empty() const { assertModuleIsMaterialized(); diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 143d0b7..6f92ba6 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandBits( if (OperandNo != 0) AB = AOut; break; - case Instruction::ICmp: - // Count the number of leading zeroes in each operand. - ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1)); - auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes); - break; } } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 4171657..5633aa4 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return true; O << -MO.getImm(); return false; + case 's': // The GCC deprecated s modifier + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << ((32 - MO.getImm()) & 31); + return false; } } return true; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ae62b6b..f56c8e4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) { llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!"); } -/// Determine whether two variable pieces overlap. -static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) { - if (!P1->isBitPiece() || !P2->isBitPiece()) - return true; +// Determine the relative position of the pieces described by P1 and P2. +// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap, +// 1 if P1 is entirely after P2. +static int pieceCmp(const DIExpression *P1, const DIExpression *P2) { unsigned l1 = P1->getBitPieceOffset(); unsigned l2 = P2->getBitPieceOffset(); unsigned r1 = l1 + P1->getBitPieceSize(); unsigned r2 = l2 + P2->getBitPieceSize(); - // True where [l1,r1[ and [r1,r2[ overlap. - return (l1 < r2) && (l2 < r1); + if (r1 <= l2) + return -1; + else if (r2 <= l1) + return 1; + else + return 0; +} + +/// Determine whether two variable pieces overlap. 
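+/// (Editor's note: operands that are not bit pieces are conservatively
+/// reported as overlapping everything, per the early return below.)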
+static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
+  if (!P1->isBitPiece() || !P2->isBitPiece())
+    return true;
+  return pieceCmp(P1, P2) == 0;
 }
 
 /// \brief If this and Next are describing different pieces of the same
@@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
 /// Return true if the merge was successful.
 bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
   if (Begin == Next.Begin) {
-    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
-    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
-    if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
-        !piecesOverlap(Expr, NextExpr)) {
-      addValues(Next.Values);
-      End = Next.End;
-      return true;
+    auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
+    auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
+    if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
+      return false;
+
+    // We can only merge entries if none of the pieces overlap any others.
+    // In doing so, we can take advantage of the fact that both lists are
+    // sorted.
+    for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
+      for (; j < Next.Values.size(); ++j) {
+        int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
+                           cast<DIExpression>(Next.Values[j].Expression));
+        if (res == 0) // The two expressions overlap, we can't merge.
+          return false;
+        // Values[i] is entirely before Next.Values[j],
+        // so go back to the next entry of Values.
+        else if (res == -1)
+          break;
+        // Next.Values[j] is entirely before Values[i], so go on to the
+        // next entry of Next.Values.
+      }
     }
+
+    addValues(Next.Values);
+    End = Next.End;
+    return true;
   }
   return false;
 }
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index eb9deb6..4d224a0 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -313,8 +313,8 @@ void Value::takeName(Value *V) {
     ST->reinsertValue(this);
 }
 
-#ifndef NDEBUG
 void Value::assertModuleIsMaterialized() const {
+#ifndef NDEBUG
   const GlobalValue *GV = dyn_cast<GlobalValue>(this);
   if (!GV)
     return;
@@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized() const {
   if (!M)
     return;
   assert(M->isMaterialized());
+#endif
 }
 
+#ifndef NDEBUG
 static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
                      Constant *C) {
   if (!Cache.insert(Expr).second)
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 46ef2c1..cd3e84d 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo;
 include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
+include "AArch64SchedM1.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                    "Cortex-A35 ARM processors", [
@@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
 // FIXME: Cortex-A72 is currently modelled as a Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
-// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
-def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; +def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 9b73c5e..92cf1cd 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } + if (LHS.getValueType().getVectorElementType() == MVT::f16) + return SDValue(); + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td new file mode 100644 index 0000000..6525628 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -0,0 +1,359 @@ +//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Samsung Exynos-M1 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M1 is a traditional superscalar microprocessor with a +// 4-wide in-order stage for decode and dispatch and a wider issue stage. +// The execution units and loads and stores are out-of-order. + +def ExynosM1Model : SchedMachineModel { + let IssueWidth = 4; // Up to 4 uops per cycle. + let MinLatency = 0; // OoO. + let MicroOpBufferSize = 96; // ROB size. + let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoadLatency = 4; // Optimistic load cases. + let MispredictPenalty = 14; // Minimum branch misprediction penalty. + let CompleteModel = 0; // Use the default model otherwise. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on the Exynos-M1, +// which has 9 pipelines, each with its own queue with out-of-order dispatch. + +def M1UnitA : ProcResource<2>; // Simple integer +def M1UnitC : ProcResource<1>; // Simple and complex integer +def M1UnitB : ProcResource<2>; // Branch +def M1UnitL : ProcResource<1>; // Load +def M1UnitS : ProcResource<1>; // Store +def M1PipeF0 : ProcResource<1>; // FP #0 +def M1PipeF1 : ProcResource<1>; // FP #1 + +let Super = M1PipeF0 in { + def M1UnitFMAC : ProcResource<1>; // FP multiplication + def M1UnitFCVT : ProcResource<1>; // FP conversion + def M1UnitNAL0 : ProcResource<1>; // Simple vector. + def M1UnitNMISC : ProcResource<1>; // Miscellanea + def M1UnitNCRYPT : ProcResource<1>; // Cryptographic +} + +let Super = M1PipeF1 in { + def M1UnitFADD : ProcResource<1>; // Simple FP + let BufferSize = 1 in + def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized) + def M1UnitNAL1 : ProcResource<1>; // Simple vector. + def M1UnitFST : ProcResource<1>; // FP store +} + +let SchedModel = ExynosM1Model in { + def M1UnitALU : ProcResGroup<[M1UnitA, + M1UnitC]>; // All simple integer. 
+ def M1UnitNALU : ProcResGroup<[M1UnitNAL0, + M1UnitNAL1]>; // All simple vector. +} + +let SchedModel = ExynosM1Model in { + +//===----------------------------------------------------------------------===// +// Coarse scheduling model for the Exynos-M1. + +// Branch instructions. +// TODO: Non-conditional direct branches take zero cycles and units. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +// TODO: Branch and link is much different. + +// Arithmetic and logical integer instructions. +def : WriteRes { let Latency = 1; } +// TODO: Shift over 3 and some extensions take 2 cycles. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +// Move instructions. +def : WriteRes { let Latency = 1; } + +// Divide and multiply instructions. +// TODO: Division blocks the divider inside C. +def : WriteRes { let Latency = 13; } +def : WriteRes { let Latency = 21; } +// TODO: Long multiplication take 5 cycles and also the ALU. +// TODO: Multiplication with accumulation can be advanced. +def : WriteRes { let Latency = 3; } +// TODO: 64-bit multiplication has a throughput of 1/2. +def : WriteRes { let Latency = 4; } + +// Miscellaneous instructions. +def : WriteRes { let Latency = 2; } + +// TODO: The latency for the post or pre register is 1 cycle. +def : WriteRes { let Latency = 0; } + +// Load instructions. +def : WriteRes { let Latency = 4; } +// TODO: Extended address requires also the ALU. +def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 4; } + +// Store instructions. +def : WriteRes { let Latency = 1; } +// TODO: Extended address requires also the ALU. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +// FP data instructions. +def : WriteRes { let Latency = 3; } +// TODO: FCCMP is much different. +def : WriteRes { let Latency = 4; } +// TODO: DP takes longer. +def : WriteRes { let Latency = 15; } +// TODO: MACC takes longer. +def : WriteRes { let Latency = 4; } + +// FP miscellaneous instructions. +// TODO: Conversion between register files is much different. +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 1; } +// TODO: Copy from FPR to GPR is much different. +def : WriteRes { let Latency = 4; } + +// FP load instructions. +// TODO: ASIMD loads are much different. +def : WriteRes { let Latency = 5; } + +// FP store instructions. +// TODO: ASIMD stores are much different. +def : WriteRes { let Latency = 1; } + +// ASIMD FP instructions. +// TODO: Other operations are much different. +def : WriteRes { let Latency = 3; } + +// Other miscellaneous instructions. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Fast forwarding. + +// TODO: Add FP register forwarding rules. + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +// Integer multiply-accumulate. +// TODO: The forwarding for WriteIM64 saves actually 3 cycles. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Finer scheduling model for the Exynos-M1. 
+ +def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, + M1UnitNALU, + M1UnitFADD]> { let Latency = 9; } +def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 5; } +def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, + M1UnitFST, + M1UnitL]> { let Latency = 10; } +def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, + M1UnitFST]> { let Latency = 8; } +def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, + M1UnitFST, + M1UnitL]> { let Latency = 13; } +def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 3; } +def M1WriteNEONI : SchedWriteRes<[M1UnitFST, + M1UnitL]> { let Latency = 9; } +def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } +def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } +// FIXME: This is the worst case, conditional branch and link. +def M1WriteBL : SchedWriteRes<[M1UnitB, + M1UnitALU]> { let Latency = 1; } +// FIXME: This is the worst case, when using LR. +def M1WriteBLR : SchedWriteRes<[M1UnitB, + M1UnitALU, + M1UnitALU]> { let Latency = 2; } +def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } +def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } +def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } +def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } +def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } +def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } +def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } +def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; } +def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; } +def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; } +def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; } +def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } +def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; } +def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; } +def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; } +def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; } +def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } +def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } +def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } +def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } +def M1WriteTB : SchedWriteRes<[M1UnitC, + M1UnitALU]> { let Latency = 2; } + +// Branch instructions +def : InstRW<[M1WriteB ], (instrs Bcc)>; +def : InstRW<[M1WriteBL], (instrs BL)>; +def : InstRW<[M1WriteBLR], (instrs BLR)>; +def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; +def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>; + +// Arithmetic and logical integer instructions. +def : InstRW<[M1WriteALU1], (instrs COPY)>; + +// Divide and multiply instructions. + +// Miscellaneous instructions. + +// Load instructions. + +// Store instructions. + +// FP data instructions. 
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; +def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>; +def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>; +def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>; +def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; +def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>; +def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>; +def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; +def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; + +// FP miscellaneous instructions. +def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; +def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; +def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; +def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; +def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; + +// FP load instructions. + +// FP store instructions. + +// ASIMD instructions. +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; +def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; +def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>; +def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>; + +// ASIMD FP instructions. 
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>; +def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; +def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>; +def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>; +def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; + +// ASIMD miscellaneous instructions. +def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; +def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; +def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; +def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; +def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; +def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>; +def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>; +def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>; +def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 2>], + (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 3>], + (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 4>], + (instregex "^TB[LX]v8i8Four")>; +def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 2>], + (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 3>], + (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], + (instregex "^TB[LX]v16i8Four")>; +def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; +def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; + +// ASIMD load instructions. + +// ASIMD store instructions. + +// Cryptography instructions. +def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>; + +// CRC instructions. 
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>; + +} // SchedModel = ExynosM1Model diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 79c6604..844d89c 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>; def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; +def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; class SubtargetFeatureLocalMemorySize : SubtargetFeature< "localmemorysize"#Value, @@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + FeatureGCN3Encoding, FeatureCIInsts]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 4796e9e..49c94f1 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -53,7 +53,8 @@ public: ISAVersion7_0_0, ISAVersion7_0_1, ISAVersion8_0_0, - ISAVersion8_0_1 + ISAVersion8_0_1, + ISAVersion8_0_3 }; private: diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index a1584a2..4300d97 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -128,21 +128,23 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, //===----------------------------------------------------------------------===// def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] >; def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32] >; def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] >; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 609f5e7..025ed2b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, bool IsLoad = TII->get(LoadStoreOp).mayLoad(); bool RanOutOfSGPRs = false; + bool Scavenged = false; unsigned SOffset = ScratchOffset; unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); @@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, if (SOffset == AMDGPU::NoRegister) { RanOutOfSGPRs = true; SOffset = AMDGPU::SGPR0; + } else { + Scavenged = true; } BuildMI(*MBB, MI, DL, 
TII->get(AMDGPU::S_ADD_U32), SOffset) .addReg(ScratchOffset) @@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; + unsigned SOffsetRegState = 0; + if (i + 1 == e && Scavenged) + SOffsetRegState |= RegState::Kill; + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) .addReg(ScratchRsrcReg) - .addReg(SOffset) + .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3b4c235..1f5deae 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; + if (Features.test(FeatureISAVersion8_0_3)) + return {8, 0, 3}; + return {0, 0, 0}; } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index dfbb969..6e7edbf 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, // If Offset is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (Offset.getOpcode() == ISD::MUL) { + if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { @@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, // If OffReg is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (OffReg.getOpcode() == ISD::MUL) { + if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index b451ebf..16dcd46 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { // extension rather than sign extension. Make sure we pass the return // value extension property to integer materialization. unsigned SrcReg = - PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); + int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue(); // If the constant is in range, use a load-immediate. - if (UseSExt && isInt<16>(CI->getSExtValue())) { + // Since LI will sign extend the constant we need to make sure that for + // our zeroext constants that the sign extended constant fits into 16-bits - + // a range of 0..0x7fff. + if (isInt<16>(Imm)) { unsigned Opc = (VT == MVT::i64) ? 
PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getSExtValue()); - return ImmReg; - } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { - unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; - unsigned ImmReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getZExtValue()); + .addImm(Imm); return ImmReg; } // Construct the constant piecewise. - int64_t Imm = CI->getZExtValue(); - if (VT == MVT::i64) return PPCMaterialize64BitInt(Imm, RC); else if (VT == MVT::i32) diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index cb0271f..5367468 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, v16i8, v8i16>; def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, - v16i8, v4i32>; + v8i16, v4i32>; def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, v8i16, v4i32>; def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ee73267..b0a6127 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask, if (CCMask == SystemZ::CCMASK_CMP_NE) return SystemZ::CCMASK_TM_SOME_1; } - if (EffectivelyUnsigned && CmpVal <= Low) { + if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) { if (CCMask == SystemZ::CCMASK_CMP_LT) return SystemZ::CCMASK_TM_ALL_0; if (CCMask == SystemZ::CCMASK_CMP_GE) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 34f3919..c12a3ed 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SETCCE, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); @@ -14975,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); - return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), - DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
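An aside on the LowerSETCCE change above: SETCCE is a compare that also consumes the carry/borrow produced by an earlier subtraction, which is how wide integers are compared limb by limb. The patch's fix itself is bookkeeping, since X86ISD::SETCC always yields i8, an i1-typed SETCCE result now gets an explicit TRUNCATE. The standalone C++ sketch below models only the arithmetic; sbb and ult128 are illustrative names, not the DAG APIs.

    #include <cstdint>
    #include <cstdio>

    // Subtract with incoming borrow; returns the difference limb and updates
    // the outgoing borrow, like one x86 SBB step.
    static uint64_t sbb(uint64_t a, uint64_t b, bool &borrow) {
      uint64_t d = a - b - (borrow ? 1 : 0);
      borrow = (a < b) || (borrow && a == b);
      return d;
    }

    // "a < b" for 128-bit values held as two 64-bit limbs: the low limb's SUB
    // sets the borrow, the SETCCE-style compare on the high limb consumes it.
    static bool ult128(uint64_t aLo, uint64_t aHi, uint64_t bLo, uint64_t bHi) {
      bool borrow = false;
      (void)sbb(aLo, bLo, borrow); // SUB: produces the initial borrow
      (void)sbb(aHi, bHi, borrow); // SBB: consumes and propagates it
      return borrow;               // SETcc reads the final flag
    }

    int main() {
      std::printf("%d\n", ult128(0, 1, 5, 1)); // high limbs equal, 0 < 5 -> 1
      std::printf("%d\n", ult128(5, 2, 0, 1)); // high limb 2 > 1 -> 0
    }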
@@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { + if (isAllOnesConstant(Mask)) + return DAG.getTargetConstant(1, dl, MaskVT); + if (X86::isZeroNode(Mask)) + return DAG.getTargetConstant(0, dl, MaskVT); + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, @@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); @@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); //SDVTList VTs = DAG.getVTList(MVT::Other); - SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, - Scale, Chain); + Scale, Chain, *Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -17521,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); - return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, + *Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index c0786af..d9311a3 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BO1->getOperand(0)); } - if (CI->isMaxValue(true)) { + if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { ICmpInst::Predicate Pred = I.isSigned() ? 
I.getUnsignedPredicate() : I.getSignedPredicate();
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 47406b9..dd2889d 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
         ConstantInt::get(IdxType, i),
       };
       auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
-      auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName);
+      auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
+                                              LoadName);
       V = IC.Builder->CreateInsertValue(V, L, i);
     }
 
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 5cde31a..bc4c0eb 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -380,6 +380,23 @@ static void replaceExtractElements(InsertElementInst *InsElt,
     ExtendMask.push_back(UndefValue::get(IntType));
 
   Value *ExtVecOp = ExtElt->getVectorOperand();
+  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+                                   ? ExtVecOpInst->getParent()
+                                   : ExtElt->getParent();
+
+  // TODO: This restriction matches the basic block check below when creating
+  // new extractelement instructions. If that limitation is removed, this one
+  // could also be removed. But for now, we just bail out to ensure that we
+  // will replace the extractelement instruction that is feeding our
+  // insertelement instruction. This allows the insertelement to then be
+  // replaced by a shufflevector. If the insertelement is not replaced, we can
+  // induce infinite looping because there's an optimization for extractelement
+  // that will delete our widening shuffle. This would trigger another attempt
+  // here to create that shuffle, and we spin forever.
+  if (InsertionBlock != InsElt->getParent())
+    return;
+
   auto *WideVec =
       new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
                             ConstantVector::get(ExtendMask));
@@ -387,7 +404,6 @@ static void replaceExtractElements(InsertElementInst *InsElt,
   // (as long as it's not a PHI) or at the start of the basic block of the
   // extract, so any subsequent extracts in the same basic block can use it.
   // TODO: Insert before the earliest ExtractElementInst that is replaced.
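The infinite-loop hazard described in the TODO above is the classic pair of mutually-inverse rewrites: creating the widening shuffle is locally profitable, but a later extractelement fold can delete it and re-expose the original pattern. Below is a toy standalone C++ sketch of that dynamic; widen, shrink and the guarded flag are illustrative, not InstCombine APIs.

    #include <cstdio>

    enum Form { WideShuffle, NarrowExtract };

    // Rewrite 1: widen an extract into a shuffle. When guarded, it refuses to
    // fire across block boundaries, which is what breaks the cycle.
    static bool widen(Form &F, bool sameBlock, bool guarded) {
      if (F == NarrowExtract && (sameBlock || !guarded)) {
        F = WideShuffle;
        return true;
      }
      return false;
    }

    // Rewrite 2: the competing extractelement fold deletes the widening
    // shuffle whenever it cannot be reused in the extract's own block.
    static bool shrink(Form &F, bool sameBlock) {
      if (F == WideShuffle && !sameBlock) {
        F = NarrowExtract;
        return true;
      }
      return false;
    }

    int main() {
      for (int guarded = 0; guarded <= 1; ++guarded) {
        Form F = NarrowExtract;
        int steps = 0;
        // Fixpoint driver over the cross-block case, capped at 100 steps.
        // Use "|" (not "||") so both rewrites run each round.
        for (bool changed = true; changed && steps < 100; ++steps)
          changed = widen(F, /*sameBlock=*/false, guarded) |
                    shrink(F, /*sameBlock=*/false);
        std::printf("guarded=%d steps=%d\n", guarded, steps); // 100 vs 1
      }
    }

With the guard the driver reaches a fixpoint immediately in the cross-block case; without it the two rewrites ping-pong until the step cap.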
- auto *ExtVecOpInst = dyn_cast(ExtVecOp); if (ExtVecOpInst && !isa(ExtVecOpInst)) WideVec->insertAfter(ExtVecOpInst); else diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 3125a2c..e484b69 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -90,6 +90,11 @@ static cl::opt SpeculateOneExpensiveInst( cl::desc("Allow exactly one expensive instruction to be speculatively " "executed")); +static cl::opt MaxSpeculationDepth( + "max-speculation-depth", cl::Hidden, cl::init(10), + cl::desc("Limit maximum recursion depth when calculating costs of " + "speculatively executed instructions")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned &CostRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { + // It is possible to hit a zero-cost cycle (phi/gep instructions for example), + // so limit the recursion depth. + // TODO: While this recursion limit does prevent pathological behavior, it + // would be better to track visited instructions to avoid cycles. + if (Depth == MaxSpeculationDepth) + return false; + Instruction *I = dyn_cast(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs diff --git a/test/Analysis/DemandedBits/basic.ll b/test/Analysis/DemandedBits/basic.ll index 9973edf..3fd1b32 100644 --- a/test/Analysis/DemandedBits/basic.ll +++ b/test/Analysis/DemandedBits/basic.ll @@ -10,34 +10,3 @@ define i8 @test_mul(i32 %a, i32 %b) { %3 = trunc i32 %2 to i8 ret i8 %3 } - -; CHECK-LABEL: 'test_icmp1' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFFF for %2 = shl i32 %1, 4 -define i1 @test_icmp1(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = shl i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp2' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFF for %2 = ashr i32 %1, 4 -define i1 @test_icmp2(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = ashr i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp3' -; CHECK-DAG: DemandedBits: 0xFFFFFFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0x1 for %2 = icmp eq i32 -1, %1 -define i1 @test_icmp3(i32 %a) { - %1 = and i32 %a, 255 - %2 = icmp eq i32 -1, %1 - ret i1 %2 -} diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index f6e4bdf..b892f19 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -267,4 +267,278 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { ret <4 x i16> %1 } +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_une: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp une <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function 
Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ueq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ueq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ugt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ugt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ult: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ult <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ule: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ule <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uno: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uno <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_one: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) 
#0 { + %1 = fcmp one <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oeq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oeq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ogt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ogt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_olt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp olt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ole: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ole <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ord: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ord <4 x half> %a, %b + ret <4 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 137d1f3..2f70f36 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -421,4 +421,88 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { ret <8 x i16> %1 } +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp une <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped. +define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ueq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ugt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ult <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ule <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uno <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp one <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oeq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ogt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp olt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ole <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ord <8 x half> %a, %b + ret <8 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll index 0e46622..f82e98e 100644 --- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -1,6 +1,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s ; HSA: .hsa_code_object_version 1,0 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" +; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll index 3d05da6..fdc3240 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -1,5 +1,6 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;GCN-LABEL: {{^}}main: diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll new file mode 100644 index 0000000..4a12ed5 --- /dev/null +++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; When the offset of VGPR spills into scratch space gets too large, an additional SGPR +; is used to calculate the scratch load/store address. Make sure that this +; mechanism works even when many spills happen. + +; Just test that it compiles successfully. 
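For context on what this new test exercises: when a spill slot's offset overflows an instruction's immediate field, the backend folds the large part of the offset into a scavenged SGPR and keeps only a small residual. A generic standalone C++ sketch of that split follows; the 12-bit field width and the register names are illustrative, not the actual scratch-instruction encodings.

    #include <cstdint>
    #include <cstdio>

    struct Addr { const char *BaseReg; int32_t Imm; };

    // Split the offset so the residual fits the (assumed) unsigned 12-bit
    // immediate field; otherwise model "s_add scavenged, base, high_part" and
    // address relative to the scavenged register.
    static Addr lowerSpillAddress(int32_t Offset) {
      const int32_t MaxImm = (1 << 12) - 1;
      if (Offset <= MaxImm)
        return {"scratch_base", Offset};      // fits: no extra register needed
      return {"scavenged_sgpr", Offset & MaxImm};
    }

    int main() {
      Addr A = lowerSpillAddress(100);   // small offset, fits directly
      Addr B = lowerSpillAddress(20000); // overflows the immediate field
      std::printf("%s+%d\n%s+%d\n", A.BaseReg, A.Imm, B.BaseReg, B.Imm);
    }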
+; CHECK-LABEL: test +define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in, + <96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) { +entry: + %tid = call i32 @llvm.SI.tid() nounwind readnone + + %aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + %a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr + +; mark most VGPR registers as used to increase register pressure + call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" () + call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" () + call void asm sideeffect "", "~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" () + call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" () + call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" () + call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" () + call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" () + + %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr + + ret void +} + +declare i32 @llvm.SI.tid() nounwind readnone diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll index 5d44eb0..e5f9b11 100644 --- a/test/CodeGen/ARM/shifter_operand.ll +++ b/test/CodeGen/ARM/shifter_operand.ll @@ -239,3 +239,20 @@ define void @test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) { store i32 %add, i32* %addr ret void } + +define { i32, i32 } @test_multi_use_add(i32 %base, i32 %offset) { +; CHECK-LABEL: test_multi_use_add: +; CHECK-THUMB: movs [[CONST:r[0-9]+]], #28 +; CHECK-THUMB: movt [[CONST]], #1 + + %prod = mul i32 %offset, 65564 + %sum = add i32 %base, %prod + + %ptr = inttoptr i32 %sum to i32* + %loaded = load i32, i32* %ptr + + %ret.tmp = insertvalue { i32, i32 } undef, i32 %sum, 0 + %ret = insertvalue { i32, i32 } %ret.tmp, i32 %loaded, 1 + + ret { i32, i32 } %ret +} diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll index e05ef7d..0adb5a9 100644 --- a/test/CodeGen/PowerPC/fast-isel-ret.ll +++ b/test/CodeGen/PowerPC/fast-isel-ret.ll @@ -186,3 +186,12 @@ entry: ; ELF64: blr ret i32 -1 } + +define zeroext i16 @ret20() nounwind { +entry: +; ELF64-LABEL: ret20 +; ELF64: lis{{.*}}0 +; ELF64: ori{{.*}}32768 +; ELF64: blr + ret i16 32768 +} diff --git a/test/CodeGen/PowerPC/inline-asm-s-modifier.ll b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll new file mode 100644 index 0000000..c8b00b6 --- /dev/null +++ b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll @@ -0,0 +1,10 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define void @test() { +entry: + call void asm sideeffect "mtfsb1 ${0:s}", "i"(i32 7), !srcloc !1 + ret void +} +; CHECK: #APP +; CHECK-NEXT: mtfsb1 25 + +!1 = !{i32 40} diff --git a/test/CodeGen/PowerPC/pr26193.ll b/test/CodeGen/PowerPC/pr26193.ll new file mode 100644 index 0000000..acd99bc --- /dev/null +++ b/test/CodeGen/PowerPC/pr26193.ll @@ -0,0 +1,9 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define <8 x i16> @test(<4 x i32> %a) { +entry: + %0 = tail call <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x 
i32> %a, <4 x i32> %a) + ret <8 x i16> %0 +} +; CHECK: vpkswss 2, + +declare <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32>, <4 x i32>) diff --git a/test/CodeGen/PowerPC/pr26356.ll b/test/CodeGen/PowerPC/pr26356.ll new file mode 100644 index 0000000..0f5d877 --- /dev/null +++ b/test/CodeGen/PowerPC/pr26356.ll @@ -0,0 +1,136 @@ +; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s + +define zeroext i32 @f1() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i32 @f2() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i32 @f3() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3 +; CHECK: li 3, 32767 + +define zeroext i16 @f4() { +entry: + ret i16 65535 +} +; CHECK-LABEL: @f4 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f5() { +entry: + ret i16 32768 +} +; CHECK-LABEL: @f5 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i16 @f6() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f6 +; CHECK: li 3, 32767 + +define zeroext i16 @f7() { +entry: + ret i16 -1 +} +; CHECK-LABEL: @f7 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f8() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f8 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f1s() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define signext i32 @f2s() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f3s() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3s +; CHECK: li 3, 32767 + +define signext i16 @f4s() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f4s +; CHECK: li 3, 32767 + +define signext i32 @f1sn() { +entry: + ret i32 -65535 +} +; CHECK-LABEL: @f1sn +; CHECK: lis 3, -1 +; CHECK: ori 3, 3, 1 + +define signext i32 @f2sn() { +entry: + ret i32 -32768 +} +; CHECK-LABEL: @f2sn +; CHECK: li 3, -32768 + +define signext i32 @f3sn() { +entry: + ret i32 -32767 +} +; CHECK-LABEL: @f3sn +; CHECK: li 3, -32767 + +define signext i32 @f4sn() { +entry: + ret i32 -65536 +} +; CHECK-LABEL: @f4sn +; CHECK: lis 3, -1 + +define signext i16 @f5sn() { +entry: + ret i16 -32767 +} +; CHECK-LABEL: @f5sn +; CHECK: li 3, -32767 + +define signext i16 @f6sn() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f6sn +; CHECK: li 3, -32768 diff --git a/test/CodeGen/PowerPC/pr26381.ll b/test/CodeGen/PowerPC/pr26381.ll new file mode 100644 index 0000000..a45288e --- /dev/null +++ b/test/CodeGen/PowerPC/pr26381.ll @@ -0,0 +1,8 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s + +define internal signext i32 @foo() #0 { + ret i32 -125452974 +} + +; CHECK: lis 3, -1915 +; CHECK: ori 3, 3, 48466 diff --git a/test/CodeGen/SystemZ/int-cmp-53.ll b/test/CodeGen/SystemZ/int-cmp-53.ll new file mode 100644 index 0000000..b7d985e --- /dev/null +++ b/test/CodeGen/SystemZ/int-cmp-53.ll @@ -0,0 +1,26 @@ +; This used to incorrectly use a TMLL for an always-false test at -O0. 
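A note on the guard this test covers (added in the SystemZISelLowering hunk earlier): the new CmpVal > 0 condition excludes the degenerate unsigned compare against zero, where "x < 0" is already constant-false and must not be rewritten into a real test-under-mask of the low bits. A standalone C++ sketch of the simplified predicate; mayUseTestUnderMask is an illustrative stand-in for one branch of getTestUnderMaskCond.

    #include <cstdint>
    #include <cstdio>

    // Returns true when an unsigned "x < CmpVal" style compare may be turned
    // into a test-under-mask of the low bits (simplified).
    static bool mayUseTestUnderMask(uint64_t CmpVal, uint64_t Low,
                                    bool EffectivelyUnsigned) {
      return EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low;
    }

    int main() {
      // CmpVal == 0: the source compare is constant-false for unsigned x,
      // so emitting a TMLL here would wrongly test real bits.
      std::printf("%d\n", mayUseTestUnderMask(0, 1, true)); // 0
      std::printf("%d\n", mayUseTestUnderMask(1, 1, true)); // 1
    }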
+; +; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @test(i8 *%input, i32 *%result) { +entry: +; CHECK-NOT: tmll + + %0 = load i8, i8* %input, align 1 + %1 = trunc i8 %0 to i1 + %2 = zext i1 %1 to i32 + %3 = icmp sge i32 %2, 0 + br i1 %3, label %if.then, label %if.else + +if.then: + store i32 1, i32* %result, align 4 + br label %return + +if.else: + store i32 0, i32* %result, align 4 + br label %return + +return: + ret void +} + diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 3bc67cc..9ba1819 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -259,18 +259,22 @@ define void @prefetch(<8 x i64> %ind, i8* %base) { ; CHECK: ## BB#0: ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: movb $120, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} ; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) - call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1) + call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1) ret void } - declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32) define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { @@ -790,3 +794,54 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, < ret void } +define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: scatter_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: movb $96, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + +define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) { +; CHECK-LABEL: gather_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; 
CHECK-NEXT: movw $1, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: movw $220, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4) + %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4) + + %res4 = fadd <16 x float> %res, %res1 + %res5 = fadd <16 x float> %res3, %res2 + %res6 = fadd <16 x float> %res5, %res4 + ret <16 x float> %res6 +} diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 77739e7..91b42bd 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -1,26 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX +; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32 + ; Verify that we don't crash during codegen due to a wrong lowering ; of a setcc node with illegal operand types and return type. define <8 x i16> @pr25080(<8 x i32> %a) { -; CHECK-LABEL: pr25080: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; CHECK-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 -; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; AVX-LABEL: pr25080: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %0 = trunc <8 x i32> %a to <8 x i23> %1 = icmp eq <8 x i23> %0, zeroinitializer @@ -28,3 +30,46 @@ entry: %3 = sext <8 x i1> %2 to <8 x i16> ret <8 x i16> %3 } + +define void @pr26232(i64 %a) { +; KNL-32-LABEL: pr26232: +; KNL-32: # BB#0: # %for_test11.preheader +; KNL-32-NEXT: pushl %esi +; KNL-32-NEXT: .Ltmp0: +; KNL-32-NEXT: .cfi_def_cfa_offset 8 +; KNL-32-NEXT: .Ltmp1: +; KNL-32-NEXT: .cfi_offset %esi, -8 +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), 
%eax
+; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL-32-NEXT: movw $-1, %dx
+; KNL-32-NEXT: .align 16, 0x90
+; KNL-32-NEXT: .LBB1_1: # %for_loop599
+; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
+; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
+; KNL-32-NEXT: movl %eax, %esi
+; KNL-32-NEXT: sbbl $0, %esi
+; KNL-32-NEXT: movl $0, %esi
+; KNL-32-NEXT: cmovlw %dx, %si
+; KNL-32-NEXT: testw %si, %si
+; KNL-32-NEXT: jne .LBB1_1
+; KNL-32-NEXT: # BB#2: # %for_exit600
+; KNL-32-NEXT: popl %esi
+; KNL-32-NEXT: retl
+allocas:
+ br label %for_test11.preheader
+
+for_test11.preheader: ; preds = %for_test11.preheader, %allocas
+ br i1 undef, label %for_loop599, label %for_test11.preheader
+
+for_loop599: ; preds = %for_loop599, %for_test11.preheader
+ %less_i_load605_ = icmp slt i64 %a, 65536
+ %less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
+ %less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
+ %"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef
+ %intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
+ %res.i895 = icmp eq i16 %intmask.i894, 0
+ br i1 %res.i895, label %for_exit600, label %for_loop599
+
+for_exit600: ; preds = %for_loop599
+ ret void
+}
diff --git a/test/DebugInfo/X86/PR26148.ll b/test/DebugInfo/X86/PR26148.ll
new file mode 100644
index 0000000..b552508
--- /dev/null
+++ b/test/DebugInfo/X86/PR26148.ll
@@ -0,0 +1,102 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
+;
+; Created using clang -g -O3 from:
+; struct S0 {
+; short f0;
+; int f3;
+; } a;
+; void fn1(short p1) {
+; struct S0 b, c = {3};
+; b.f3 = p1;
+; a = b = c;
+; }
+;
+; int main() { return 0; }
+;
+; This is similar to the bug in test/DebugInfo/ARM/PR26163.ll, except that there is an
+; extra non-overlapping range first. Thus, we make sure that the backend actually looks
+; at all expressions when determining whether to merge ranges, not just the first one.
+; As in 26163, we expect two ranges (as opposed to one), the first one being zero-sized.
+;
+;
+; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
+; CHECK: Ending address offset: 0x0000000000000004
+; CHECK: Location description: 10 03 55 93 04
+; CHECK: Beginning address offset: 0x0000000000000004
+; CHECK: Ending address offset: 0x0000000000000014
+; CHECK: Location description: 10 03 10 00
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+%struct.S0 = type { i16, i32 }
+
+@a = common global %struct.S0 zeroinitializer, align 4
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+; The attributes are here to force the zero-sized range not to be at the start of
+; the function, which has special interpretation in DWARF. The fact that this happens
+; at all is probably an LLVM bug.
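+;
+; For orientation (assuming LLVM's (offset, size) operand order for the
+; DW_OP_bit_piece operands in DIExpression): !30 below covers bits [0, 32)
+; of the struct (f0 plus 16 bits of padding) and !31 covers bits [32, 64)
+; (f3), so each dbg.value describes only a piece of the variable, and the
+; range-merging code has to compare these piece expressions rather than
+; just the address ranges.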
+attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } +define void @fn1(i16 signext %p1) #0 !dbg !4 { +entry: + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !9, metadata !26), !dbg !27 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !10, metadata !26), !dbg !28 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !16, metadata !26), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !16, metadata !30), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29 + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !10, metadata !32), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !30), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !31), !dbg !28 + store i32 3, i32* bitcast (%struct.S0* @a to i32*), align 4, !dbg !33 + store i32 0, i32* getelementptr inbounds (%struct.S0, %struct.S0* @a, i64 0, i32 1), align 4, !dbg !33 + ret void, !dbg !34 +} + +define i32 @main() !dbg !17 { +entry: + ret i32 0, !dbg !35 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!22, !23, !24} +!llvm.ident = !{!25} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3, globals: !20) +!1 = !DIFile(filename: "small.c", directory: "/Users/kfischer/Projects/clangbug") +!2 = !{} +!3 = !{!4, !17} +!4 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !5, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7} +!7 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed) +!8 = !{!9, !10, !16} +!9 = !DILocalVariable(name: "p1", arg: 1, scope: !4, file: !1, line: 5, type: !7) +!10 = !DILocalVariable(name: "b", scope: !4, file: !1, line: 6, type: !11) +!11 = !DICompositeType(tag: DW_TAG_structure_type, name: "S0", file: !1, line: 1, size: 64, align: 32, elements: !12) +!12 = !{!13, !14} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "f0", scope: !11, file: !1, line: 2, baseType: !7, size: 16, align: 16) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "f3", scope: !11, file: !1, line: 3, baseType: !15, size: 32, align: 32, offset: 32) +!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!16 = !DILocalVariable(name: "c", scope: !4, file: !1, line: 6, type: !11) +!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 11, type: !18, isLocal: false, isDefinition: true, scopeLine: 11, isOptimized: true, variables: !2) +!18 = !DISubroutineType(types: !19) +!19 = !{!15} +!20 = !{!21} +!21 = !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 4, type: !11, isLocal: false, isDefinition: true, variable: %struct.S0* @a) +!22 = !{i32 2, !"Dwarf Version", i32 2} +!23 = !{i32 2, !"Debug Info Version", i32 3} +!24 = !{i32 1, !"PIC Level", i32 2} +!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)"} +!26 = !DIExpression() +!27 = !DILocation(line: 5, column: 16, scope: !4) +!28 = 
!DILocation(line: 6, column: 13, scope: !4)
+!29 = !DILocation(line: 6, column: 16, scope: !4)
+!30 = !DIExpression(DW_OP_bit_piece, 0, 32)
+!31 = !DIExpression(DW_OP_bit_piece, 32, 32)
+!32 = !DIExpression(DW_OP_bit_piece, 32, 16)
+!33 = !DILocation(line: 8, column: 9, scope: !4)
+!34 = !DILocation(line: 9, column: 1, scope: !4)
+!35 = !DILocation(line: 11, column: 14, scope: !17)
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 7d6ec96..1e64cd7 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1672,3 +1672,15 @@ define i1 @cmp_slt_rhs_inc(float %x, i32 %i) {
 %cmp = icmp slt i32 %conv, %inc
 ret i1 %cmp
 }
+
+; CHECK-LABEL: @PR26407
+; CHECK-NEXT: %[[addx:.*]] = add i32 %x, 2147483647
+; CHECK-NEXT: %[[addy:.*]] = add i32 %y, 2147483647
+; CHECK-NEXT: %[[cmp:.*]] = icmp uge i32 %[[addx]], %[[addy]]
+; CHECK-NEXT: ret i1 %[[cmp]]
+define i1 @PR26407(i32 %x, i32 %y) {
+ %addx = add i32 %x, 2147483647
+ %addy = add i32 %y, 2147483647
+ %cmp = icmp uge i32 %addx, %addy
+ ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 47c2a13..8ed4db8 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -175,3 +175,33 @@ bb3:
 ret <4 x double> %tmp4
 }
+; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354
+; Don't create a shufflevector if we know that we're not going to replace the insertelement.
+
+define double @pr26354(<2 x double>* %tmp, i1 %B) {
+; CHECK-LABEL: @pr26354(
+; CHECK: %ld = load <2 x double>, <2 x double>* %tmp
+; CHECK-NEXT: %e1 = extractelement <2 x double> %ld, i32 0
+; CHECK-NEXT: br i1 %B, label %if, label %end
+; CHECK: if:
+; CHECK-NEXT: %e2 = extractelement <2 x double> %ld, i32 1
+; CHECK-NEXT: %i1 = insertelement <4 x double>
+; CHECK-NEXT: br label %end
+
+entry:
+ %ld = load <2 x double>, <2 x double>* %tmp
+ %e1 = extractelement <2 x double> %ld, i32 0
+ %e2 = extractelement <2 x double> %ld, i32 1
+ br i1 %B, label %if, label %end
+
+if:
+ %i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3
+ br label %end
+
+end:
+ %ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ]
+ %e3 = extractelement <4 x double> %ph, i32 1
+ %mu = fmul double %e1, %e3
+ ret double %mu
+}
+
diff --git a/test/Transforms/InstCombine/unpack-fca.ll b/test/Transforms/InstCombine/unpack-fca.ll
index 9b8d104..4359839 100644
--- a/test/Transforms/InstCombine/unpack-fca.ll
+++ b/test/Transforms/InstCombine/unpack-fca.ll
@@ -136,3 +136,18 @@ define %B @structB(%B* %b.ptr) {
 %1 = load %B, %B* %b.ptr, align 8
 ret %B %1
 }
+
+%struct.S = type <{ i8, %struct.T }>
+%struct.T = type { i32, i32 }
+
+; Make sure that we do not increase the alignment of a packed struct element.
+define i32 @packed_alignment(%struct.S* dereferenceable(9) %s) {
+; CHECK-LABEL: packed_alignment
+; CHECK-NEXT: %tv.elt1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1, i32 1
+; CHECK-NEXT: %tv.unpack2 = load i32, i32* %tv.elt1, align 1
+; CHECK-NEXT: ret i32 %tv.unpack2
+ %t = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1
+ %tv = load %struct.T, %struct.T* %t, align 1
+ %v = extractvalue %struct.T %tv, 1
+ ret i32 %v
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index eee3104..51f899c
--- 
a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -205,39 +205,5 @@ for.body: ; preds = %for.body, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_g -; CHECK: load <16 x i8> -; CHECK: xor <16 x i8> -; CHECK: icmp ult <16 x i8> -; CHECK: select <16 x i1> {{.*}}, <16 x i8> -; CHECK: store <16 x i8> -define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 { - %1 = icmp sgt i32 %len, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0 - %2 = sext i8 %arg1 to i64 - br label %3 - -._crit_edge: ; preds = %3, %0 - ret void - -;