author:    dim <dim@FreeBSD.org>  2015-01-18 16:17:27 +0000
committer: dim <dim@FreeBSD.org>  2015-01-18 16:17:27 +0000
commit:    081af4da16b9046c019ca40f64b1fb7ee8c6dca1
tree:      4abb9cbeecc7901726dd0b4a37369596c852e9ef
parent:    3c7e7a1538a873b0d3b012ef8811969ac4552c2a
Vendor import of llvm RELEASE_360/rc1 tag r226102 (effectively, 3.6.0 RC1):
https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc1@226102
Diffstat (limited to 'lib/Target/AArch64')
-rw-r--r--  lib/Target/AArch64/AArch64.h | 11
-rw-r--r--  lib/Target/AArch64/AArch64A53Fix835769.cpp | 240
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 706
-rw-r--r--  lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 6
-rw-r--r--  lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64AsmPrinter.cpp | 33
-rw-r--r--  lib/Target/AArch64/AArch64BranchRelaxation.cpp | 11
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.h | 141
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.td | 35
-rw-r--r--  lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64CollectLOH.cpp | 52
-rw-r--r--  lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 422
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 9
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 4510
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp | 57
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 272
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 1319
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64InstrAtomics.td | 9
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td | 44
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 997
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 416
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 16
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64MachineCombinerPattern.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64MachineFunctionInfo.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 383
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.h | 38
-rw-r--r--  lib/Target/AArch64/AArch64PerfectShuffle.h | 5
-rw-r--r--  lib/Target/AArch64/AArch64PromoteConstant.cpp | 8
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.td | 5
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57.td | 371
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57WriteRes.td | 52
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64StorePairSuppress.cpp | 14
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 34
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h | 40
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 133
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.h | 34
-rw-r--r--  lib/Target/AArch64/AArch64TargetObjectFile.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 78
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 149
-rw-r--r--  lib/Target/AArch64/CMakeLists.txt | 6
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 67
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.h | 11
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h | 4
-rw-r--r--  lib/Target/AArch64/Disassembler/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 18
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 9
-rw-r--r--  lib/Target/AArch64/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h | 104
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 53
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 74
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h | 8
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 7
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 7
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 47
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 15
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 85
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 2
-rw-r--r--  lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp | 13
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 36
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h | 13
76 files changed, 9622 insertions, 1836 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 1c022aa..e96d18b 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -12,13 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#ifndef TARGET_AArch64_H
-#define TARGET_AArch64_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
-#include "Utils/AArch64BaseInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -36,7 +36,10 @@ FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A53Fix835769();
/// \brief Creates an ARM-specific Target Transformation Info pass.
ImmutablePass *
createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
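For orientation: the factory functions declared above are consumed by the target's pass configuration when the codegen pipeline is built. A minimal sketch of that wiring, assuming the usual TargetPassConfig hooks of this era; the hook placement and the EnableA53Fix835769 option guard are assumptions, not verbatim 3.6 code:

    // Sketch: inside AArch64TargetMachine.cpp (illustrative, hedged).
    bool AArch64PassConfig::addPostRegAlloc() {
      // The A57 balancer rewrites physical registers, so it must run
      // after register allocation.
      if (TM->getOptLevel() != CodeGenOpt::None)
        addPass(createAArch64A57FPLoadBalancing());
      return true;
    }

    bool AArch64PassConfig::addPreEmitPass() {
      if (EnableA53Fix835769) // assumed cl::opt guard
        addPass(createAArch64A53Fix835769());
      return true;
    }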
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
new file mode 100644
index 0000000..852a635
--- /dev/null
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -0,0 +1,240 @@
+//===-- AArch64A53Fix835769.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass works around Cortex-A53 erratum 835769 by inserting a nop
+// instruction into code sequences that, in some circumstances, may trigger
+// the erratum.
+// Specifically, it inserts a nop between consecutive instructions of the
+// following two classes:
+// instr 1: a memory instruction (load, store or prefetch).
+// instr 2: a non-SIMD integer multiply-accumulate writing 64-bit X registers.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a match for the instruction that comes first in the
+// sequence of instructions that can trigger the erratum?
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+ // Must return true if this instruction is a load, a store or a prefetch.
+ switch (MI->getOpcode()) {
+ case AArch64::PRFMl:
+ case AArch64::PRFMroW:
+ case AArch64::PRFMroX:
+ case AArch64::PRFMui:
+ case AArch64::PRFUMi:
+ return true;
+ default:
+ return (MI->mayLoad() || MI->mayStore());
+ }
+}
+
+// Is the instruction a match for the instruction that comes second in the
+// sequence that can trigger the erratum?
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+ // Must return true for non-SIMD integer multiply-accumulates, writing
+ // to a 64-bit register.
+ switch (MI->getOpcode()) {
+ // Erratum cannot be triggered when the destination register is 32 bits,
+ // therefore only include the following.
+ case AArch64::MSUBXrrr:
+ case AArch64::MADDXrrr:
+ case AArch64::SMADDLrrr:
+ case AArch64::SMSUBLrrr:
+ case AArch64::UMADDLrrr:
+ case AArch64::UMSUBLrrr:
+ // Erratum can only be triggered by multiply-adds, not by regular
+ // non-accumulating multiplies, i.e. when Ra=XZR='11111'
+ return MI->getOperand(3).getReg() != AArch64::XZR;
+ default:
+ return false;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64A53Fix835769 : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+
+public:
+ static char ID;
+ explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override {
+ return "Workaround A53 erratum 835769 pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+char AArch64A53Fix835769::ID = 0;
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool
+AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
+ const TargetMachine &TM = F.getTarget();
+
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+
+ TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ for (auto &MBB : F) {
+ Changed |= runOnBasicBlock(MBB);
+ }
+
+ return Changed;
+}
+
+// Return the block that was fallen through to get to MBB, if any,
+// otherwise nullptr.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ // Get the previous machine basic block in the function.
+ MachineFunction::iterator MBBI = *MBB;
+
+ // Can't go off top of function.
+ if (MBBI == MBB->getParent()->begin())
+ return nullptr;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 2> Cond;
+
+ MachineBasicBlock *PrevBB = std::prev(MBBI);
+ for (MachineBasicBlock *S : MBB->predecessors())
+ if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
+ !TBB && !FBB)
+ return S;
+
+ return nullptr;
+}
+
+// Iterate through the fallen-through blocks, returning the last non-pseudo
+// instruction found in them, or nullptr if there is none. Only previous
+// blocks are searched, never MBB itself, since this helper exists solely
+// to inspect the code preceding MBB.
+static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
+ const TargetInstrInfo *TII) {
+ MachineBasicBlock *FMBB = &MBB;
+
+ // Walk back through the fallen-through predecessors (if any), scanning
+ // each in reverse until a non-pseudo instruction is found.
+ while ((FMBB = getBBFallenThrough(FMBB, TII))) {
+ for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) {
+ if (!I->isPseudo())
+ return &*I;
+ }
+ }
+
+ // There was no previous non-pseudo in the fallen through blocks
+ return nullptr;
+}
+
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+ const TargetInstrInfo *TII) {
+ // If we are the first instruction of the block, put the NOP at the end of
+ // the previous fallthrough block
+ if (MI == &MBB.front()) {
+ MachineInstr *I = getLastNonPseudo(MBB, TII);
+ assert(I && "Expected instruction");
+ DebugLoc DL = I->getDebugLoc();
+ BuildMI(I->getParent(), DL, TII->get(AArch64::HINT)).addImm(0);
+ }
+ else {
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0);
+ }
+
+ ++NumNopsAdded;
+}
+
+bool
+AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+ // First, scan the basic block, looking for a sequence of 2 instructions
+ // that match the conditions under which the erratum may trigger.
+
+ // List of terminating instructions in matching sequences
+ std::vector<MachineInstr*> Sequences;
+ unsigned Idx = 0;
+ MachineInstr *PrevInstr = nullptr;
+
+ // Try to find the last non-pseudo instruction in any fallen-through blocks;
+ // if there isn't one, use nullptr to represent that.
+ PrevInstr = getLastNonPseudo(MBB, TII);
+
+ for (auto &MI : MBB) {
+ MachineInstr *CurrInstr = &MI;
+ DEBUG(dbgs() << " Examining: " << MI);
+ if (PrevInstr) {
+ DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
+ << " CurrInstr: " << *CurrInstr
+ << " isFirstInstructionInSequence(PrevInstr): "
+ << isFirstInstructionInSequence(PrevInstr) << "\n"
+ << " isSecondInstructionInSequence(CurrInstr): "
+ << isSecondInstructionInSequence(CurrInstr) << "\n");
+ if (isFirstInstructionInSequence(PrevInstr) &&
+ isSecondInstructionInSequence(CurrInstr)) {
+ DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+ Sequences.push_back(CurrInstr);
+ }
+ }
+ if (!CurrInstr->isPseudo())
+ PrevInstr = CurrInstr;
+ ++Idx;
+ }
+
+ DEBUG(dbgs() << "Scan complete, "<< Sequences.size()
+ << " occurences of pattern found.\n");
+
+ // Then update the basic block, inserting nops between the detected sequences.
+ for (auto &MI : Sequences) {
+ Changed = true;
+ insertNopBeforeInstruction(MBB, MI, TII);
+ }
+
+ return Changed;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the pass manager.
+FunctionPass *llvm::createAArch64A53Fix835769() {
+ return new AArch64A53Fix835769();
+}
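To make the erratum pattern concrete, a hand-written example (registers are arbitrary): the pass treats the store as instr 1 and the 64-bit multiply-accumulate as instr 2, and breaks the pair with a nop, emitted as HINT #0:

    // Before the pass:
    //   str  x5, [sp, #16]       // instr 1: a memory operation
    //   madd x0, x1, x2, x3      // instr 2: 64-bit multiply-accumulate
    //
    // After the pass:
    //   str  x5, [sp, #16]
    //   nop                      // HINT #0 from insertNopBeforeInstruction
    //   madd x0, x1, x2, x3
    //
    // A plain mul (Ra == xzr) would not match, per
    // isSecondInstructionInSequence.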
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
new file mode 100644
index 0000000..dd1a1ea
--- /dev/null
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -0,0 +1,706 @@
+//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For best-case performance on Cortex-A57, we should try to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply or
+// multiply-accumulate operations.
+//
+// This pass attempts to detect situations where the register allocation may
+// adversely affect this load balancing and to change the registers used so as
+// to better utilize the CPU.
+//
+// Ideally we'd just take each multiply or multiply-accumulate in turn and
+// allocate it alternating even or odd registers. However, multiply-accumulates
+// are most efficiently performed in the same functional unit as their
+// accumulation operand. Therefore this pass tries to find maximal sequences
+// ("Chains") of multiply-accumulates linked via their accumulation operand,
+// and assign them all the same "color" (oddness/evenness).
+//
+// This optimization affects S-register and D-register floating point
+// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
+// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
+// not affected.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
+
+// Force the algorithm to use the scavenged register even when the original
+// destination register is already the correct color. Used for testing.
+static cl::opt<bool>
+TransformAll("aarch64-a57-fp-load-balancing-force-all",
+ cl::desc("Always modify dest registers regardless of color"),
+ cl::init(false), cl::Hidden);
+
+// Never use the balance information obtained from chains - always return a
+// specific color. Used for testing.
+static cl::opt<unsigned>
+OverrideBalance("aarch64-a57-fp-load-balancing-override",
+ cl::desc("Ignore balance information, always return "
+ "(1: Even, 2: Odd)."),
+ cl::init(0), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
+static bool isMul(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AArch64::FMULSrr:
+ case AArch64::FNMULSrr:
+ case AArch64::FMULDrr:
+ case AArch64::FNMULDrr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
+static bool isMla(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AArch64::FMSUBSrrr:
+ case AArch64::FMADDSrrr:
+ case AArch64::FNMSUBSrrr:
+ case AArch64::FNMADDSrrr:
+ case AArch64::FMSUBDrrr:
+ case AArch64::FMADDDrrr:
+ case AArch64::FNMSUBDrrr:
+ case AArch64::FNMADDDrrr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A "color", which is either even or odd. Yes, these aren't really colors
+/// but the algorithm is conceptually doing two-color graph coloring.
+enum class Color { Even, Odd };
+#ifndef NDEBUG
+static const char *ColorNames[2] = { "Even", "Odd" };
+#endif
+
+class Chain;
+
+class AArch64A57FPLoadBalancing : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RCI;
+
+public:
+ static char ID;
+ explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override {
+ return "A57 FP Anti-dependency breaker";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+ bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
+ int &Balance);
+ bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
+ int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
+ void scanInstruction(MachineInstr *MI, unsigned Idx,
+ std::map<unsigned, Chain*> &Active,
+ std::set<std::unique_ptr<Chain>> &AllChains);
+ void maybeKillChain(MachineOperand &MO, unsigned Idx,
+ std::map<unsigned, Chain*> &RegChains);
+ Color getColor(unsigned Register);
+ Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
+};
+char AArch64A57FPLoadBalancing::ID = 0;
+
+/// A Chain is a sequence of instructions that are linked together by
+/// an accumulation operand. For example:
+///
+/// fmul d0<def>, ?
+/// fmla d1<def>, ?, ?, d0<kill>
+/// fmla d2<def>, ?, ?, d1<kill>
+///
+/// There may be other instructions interleaved in the sequence that
+/// do not belong to the chain. These other instructions must not use
+/// the "chain" register at any point.
+///
+/// We currently only support chains where the "chain" operand is killed
+/// at each link in the chain for simplicity.
+/// A chain has three important instructions - Start, Last and Kill.
+/// * The start instruction is the first instruction in the chain.
+/// * Last is the final instruction in the chain.
+/// * Kill may or may not be defined. If defined, Kill is the instruction
+/// where the outgoing value of the Last instruction is killed.
+/// This information is important as if we know the outgoing value is
+/// killed with no intervening uses, we can safely change its register.
+///
+/// Without a kill instruction, we must assume the outgoing value escapes
+/// beyond our model and either must not change its register or must
+/// create a fixup FMOV to keep the old register value consistent.
+///
+class Chain {
+public:
+ /// The important (marker) instructions.
+ MachineInstr *StartInst, *LastInst, *KillInst;
+ /// The index, from the start of the basic block, that each marker
+ /// appears. These are stored so we can do quick interval tests.
+ unsigned StartInstIdx, LastInstIdx, KillInstIdx;
+ /// All instructions in the chain.
+ std::set<MachineInstr*> Insts;
+ /// True if KillInst cannot be modified. If this is true,
+ /// we cannot change LastInst's outgoing register.
+ /// This will be true for tied values and regmasks.
+ bool KillIsImmutable;
+ /// The "color" of LastInst. This will be the preferred chain color,
+ /// as changing intermediate nodes is easy but changing the last
+ /// instruction can be more tricky.
+ Color LastColor;
+
+ Chain(MachineInstr *MI, unsigned Idx, Color C)
+ : StartInst(MI), LastInst(MI), KillInst(nullptr),
+ StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
+ LastColor(C) {
+ Insts.insert(MI);
+ }
+
+ /// Add a new instruction into the chain. The instruction's dest operand
+ /// has the given color.
+ void add(MachineInstr *MI, unsigned Idx, Color C) {
+ LastInst = MI;
+ LastInstIdx = Idx;
+ LastColor = C;
+ assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+ "Chain: broken invariant. A Chain can only be killed after its last "
+ "def");
+
+ Insts.insert(MI);
+ }
+
+ /// Return true if MI is a member of the chain.
+ bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }
+
+ /// Return the number of instructions in the chain.
+ unsigned size() const {
+ return Insts.size();
+ }
+
+ /// Inform the chain that its last active register (the dest register of
+ /// LastInst) is killed by MI with no intervening uses or defs.
+ void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
+ KillInst = MI;
+ KillInstIdx = Idx;
+ KillIsImmutable = Immutable;
+ assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+ "Chain: broken invariant. A Chain can only be killed after its last "
+ "def");
+ }
+
+ /// Return the first instruction in the chain.
+ MachineInstr *getStart() const { return StartInst; }
+ /// Return the last instruction in the chain.
+ MachineInstr *getLast() const { return LastInst; }
+ /// Return the "kill" instruction (as set with setKill()) or NULL.
+ MachineInstr *getKill() const { return KillInst; }
+ /// Return an instruction that can be used as an iterator for the end
+ /// of the chain. This is the maximum of KillInst (if set) and LastInst.
+ MachineBasicBlock::iterator getEnd() const {
+ return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
+ }
+
+ /// Can the Kill instruction (assuming one exists) be modified?
+ bool isKillImmutable() const { return KillIsImmutable; }
+
+ /// Return the preferred color of this chain.
+ Color getPreferredColor() {
+ if (OverrideBalance != 0)
+ return OverrideBalance == 1 ? Color::Even : Color::Odd;
+ return LastColor;
+ }
+
+ /// Return true if this chain (StartInst..KillInst) overlaps with Other.
+ bool rangeOverlapsWith(const Chain &Other) const {
+ unsigned End = KillInst ? KillInstIdx : LastInstIdx;
+ unsigned OtherEnd = Other.KillInst ?
+ Other.KillInstIdx : Other.LastInstIdx;
+
+ return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
+ }
+
+ /// Return true if this chain starts before Other.
+ bool startsBefore(Chain *Other) {
+ return StartInstIdx < Other->StartInstIdx;
+ }
+
+ /// Return true if the group will require a fixup MOV at the end.
+ bool requiresFixup() const {
+ return (getKill() && isKillImmutable()) || !getKill();
+ }
+
+ /// Return a simple string representation of the chain.
+ std::string str() const {
+ std::string S;
+ raw_string_ostream OS(S);
+
+ OS << "{";
+ StartInst->print(OS, NULL, true);
+ OS << " -> ";
+ LastInst->print(OS, NULL, true);
+ if (KillInst) {
+ OS << " (kill @ ";
+ KillInst->print(OS, NULL, true);
+ OS << ")";
+ }
+ OS << "}";
+
+ return OS.str();
+ }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+
+ const TargetMachine &TM = F.getTarget();
+ MRI = &F.getRegInfo();
+ TRI = F.getRegInfo().getTargetRegisterInfo();
+ TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+ RCI.runOnMachineFunction(F);
+
+ for (auto &MBB : F) {
+ Changed |= runOnBasicBlock(MBB);
+ }
+
+ return Changed;
+}
+
+bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+ // First, scan the basic block producing a set of chains.
+
+ // The currently "active" chains - chains that can be added to and haven't
+ // been killed yet. This is keyed by register - all chains can only have one
+ // "link" register between each inst in the chain.
+ std::map<unsigned, Chain*> ActiveChains;
+ std::set<std::unique_ptr<Chain>> AllChains;
+ unsigned Idx = 0;
+ for (auto &MI : MBB)
+ scanInstruction(&MI, Idx++, ActiveChains, AllChains);
+
+ DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+
+ // Group the chains into disjoint sets based on their liveness range. This is
+ // a poor-man's version of graph coloring. Ideally we'd create an interference
+// graph and perform full-on graph coloring on that, but:
+ // (a) That's rather heavyweight for only two colors.
+ // (b) We expect multiple disjoint interference regions - in practice the live
+ // range of chains is quite small and they are clustered between loads
+ // and stores.
+ EquivalenceClasses<Chain*> EC;
+ for (auto &I : AllChains)
+ EC.insert(I.get());
+
+ for (auto &I : AllChains)
+ for (auto &J : AllChains)
+ if (I != J && I->rangeOverlapsWith(*J))
+ EC.unionSets(I.get(), J.get());
+ DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+
+ // Now we assume that every member of an equivalence class interferes
+ // with every other member of that class, and with no members of other classes.
+
+ // Convert the EquivalenceClasses to a simpler set of sets.
+ std::vector<std::vector<Chain*> > V;
+ for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+ std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
+ if (Cs.empty()) continue;
+ V.push_back(std::move(Cs));
+ }
+
+ // Now that we have a set of sets, order them by start address so we can
+ // iterate over them sequentially.
+ std::sort(V.begin(), V.end(),
+ [](const std::vector<Chain*> &A,
+ const std::vector<Chain*> &B) {
+ return A.front()->startsBefore(B.front());
+ });
+
+ // As we only have two colors, we can track the global (BB-level) balance of
+ // odds versus evens. We aim to keep this near zero to keep both execution
+ // units fed.
+ // Positive means we're even-heavy, negative we're odd-heavy.
+ //
+ // FIXME: If chains have interdependencies, for example:
+ // mul r0, r1, r2
+ // mul r3, r0, r1
+ // We do not model this and may color each one differently, assuming we'll
+ // get ILP when we obviously can't. This hasn't been seen to be a problem
+ // in practice so far, so we simplify the algorithm by ignoring it.
+ int Parity = 0;
+
+ for (auto &I : V)
+ Changed |= colorChainSet(std::move(I), MBB, Parity);
+
+ return Changed;
+}
+
+Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
+ std::vector<Chain*> &L) {
+ if (L.empty())
+ return nullptr;
+
+ // We try to get the best candidate from L to color next, given that our
+ // preferred color is "PreferredColor". L is ordered from larger to smaller
+ // chains. It is beneficial to color the large chains before the small chains,
+ // but if we can't find a chain of the maximum length with the preferred color,
+ // we fuzz the size and look for slightly smaller chains before giving up and
+ // returning a chain that must be recolored.
+
+ // FIXME: Does this need to be configurable?
+ const unsigned SizeFuzz = 1;
+ unsigned MinSize = L.front()->size() - SizeFuzz;
+ for (auto I = L.begin(), E = L.end(); I != E; ++I) {
+ if ((*I)->size() <= MinSize) {
+ // We've gone past the size limit. Return the previous item.
+ Chain *Ch = *--I;
+ L.erase(I);
+ return Ch;
+ }
+
+ if ((*I)->getPreferredColor() == PreferredColor) {
+ Chain *Ch = *I;
+ L.erase(I);
+ return Ch;
+ }
+ }
+
+ // Bailout case - just return the first item.
+ Chain *Ch = L.front();
+ L.erase(L.begin());
+ return Ch;
+}
+
+bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
+ MachineBasicBlock &MBB,
+ int &Parity) {
+ bool Changed = false;
+ DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+
+ // Sort by descending size order so that we allocate the most important
+ // sets first.
+ // Tie-break equivalent sizes by sorting chains requiring fixups before
+ // those without fixups. The logic here is that we should look at the
+ // chains that we cannot change before we look at those we can,
+ // so the parity counter is updated and we know what color we should
+ // change them to!
+ std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+ if (G1->size() != G2->size())
+ return G1->size() > G2->size();
+ return G1->requiresFixup() > G2->requiresFixup();
+ });
+
+ Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+ while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
+ // Start off by assuming we'll color to our own preferred color.
+ Color C = PreferredColor;
+ if (Parity == 0)
+ // But if we really don't care, use the chain's preferred color.
+ C = G->getPreferredColor();
+
+ DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
+ << ColorNames[(int)C] << "\n");
+
+ // If we'll need a fixup FMOV, don't bother. Testing has shown that this
+ // happens infrequently and when it does it has at least a 50% chance of
+ // slowing code down instead of speeding it up.
+ if (G->requiresFixup() && C != G->getPreferredColor()) {
+ C = G->getPreferredColor();
+ DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
+ "color remains " << ColorNames[(int)C] << "\n");
+ }
+
+ Changed |= colorChain(G, C, MBB);
+
+ Parity += (C == Color::Even) ? G->size() : -G->size();
+ PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+ }
+
+ return Changed;
+}
+
+int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
+ MachineBasicBlock &MBB) {
+ RegScavenger RS;
+ RS.enterBasicBlock(&MBB);
+ RS.forward(MachineBasicBlock::iterator(G->getStart()));
+
+ // Can we find an appropriate register that is available throughout the life
+ // of the chain?
+ unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
+ BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+ for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+ I != E; ++I) {
+ RS.forward(I);
+ AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+
+ // Remove any registers clobbered by a regmask or any def register that is
+ // immediately dead.
+ for (auto J : I->operands()) {
+ if (J.isRegMask())
+ AvailableRegs.clearBitsNotInMask(J.getRegMask());
+
+ if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) {
+ assert(J.isDead() && "Non-dead def should have been removed by now!");
+ AvailableRegs.reset(J.getReg());
+ }
+ }
+ }
+
+ // Make sure we allocate in-order, to get the cheapest registers first.
+ auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
+ for (auto Reg : Ord) {
+ if (!AvailableRegs[Reg])
+ continue;
+ if ((C == Color::Even && (Reg % 2) == 0) ||
+ (C == Color::Odd && (Reg % 2) == 1))
+ return Reg;
+ }
+
+ return -1;
+}
+
+bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
+ MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+ << ColorNames[(int)C] << ")\n");
+
+ // Try to obtain a free register of the right class. Without a register
+ // to play with we cannot continue.
+ int Reg = scavengeRegister(G, C, MBB);
+ if (Reg == -1) {
+ DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+ return false;
+ }
+ DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
+
+ std::map<unsigned, unsigned> Substs;
+ for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+ I != E; ++I) {
+ if (!G->contains(I) &&
+ (&*I != G->getKill() || G->isKillImmutable()))
+ continue;
+
+ // I is a member of G, or I is a mutable instruction that kills G.
+
+ std::vector<unsigned> ToErase;
+ for (auto &U : I->operands()) {
+ if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
+ unsigned OrigReg = U.getReg();
+ U.setReg(Substs[OrigReg]);
+ if (U.isKill())
+ // Don't erase straight away, because there may be other operands
+ // that also reference this substitution!
+ ToErase.push_back(OrigReg);
+ } else if (U.isRegMask()) {
+ for (auto J : Substs) {
+ if (U.clobbersPhysReg(J.first))
+ ToErase.push_back(J.first);
+ }
+ }
+ }
+ // Now it's safe to remove the substs identified earlier.
+ for (auto J : ToErase)
+ Substs.erase(J);
+
+ // Only change the def if this isn't the last instruction.
+ if (&*I != G->getKill()) {
+ MachineOperand &MO = I->getOperand(0);
+
+ bool Change = TransformAll || getColor(MO.getReg()) != C;
+ if (G->requiresFixup() && &*I == G->getLast())
+ Change = false;
+
+ if (Change) {
+ Substs[MO.getReg()] = Reg;
+ MO.setReg(Reg);
+ MRI->setPhysRegUsed(Reg);
+
+ Changed = true;
+ }
+ }
+ }
+ assert(Substs.size() == 0 && "No substitutions should be left active!");
+
+ if (G->getKill()) {
+ DEBUG(dbgs() << " - Kill instruction seen.\n");
+ } else {
+ // We didn't have a kill instruction, but we didn't seem to need to change
+ // the destination register anyway.
+ DEBUG(dbgs() << " - Destination register not changed.\n");
+ }
+ return Changed;
+}
+
+void AArch64A57FPLoadBalancing::
+scanInstruction(MachineInstr *MI, unsigned Idx,
+ std::map<unsigned, Chain*> &ActiveChains,
+ std::set<std::unique_ptr<Chain>> &AllChains) {
+ // Inspect "MI", updating ActiveChains and AllChains.
+
+ if (isMul(MI)) {
+
+ for (auto &I : MI->uses())
+ maybeKillChain(I, Idx, ActiveChains);
+ for (auto &I : MI->defs())
+ maybeKillChain(I, Idx, ActiveChains);
+
+ // Create a new chain. Multiplies don't require forwarding so can go on any
+ // unit.
+ unsigned DestReg = MI->getOperand(0).getReg();
+
+ DEBUG(dbgs() << "New chain started for register "
+ << TRI->getName(DestReg) << " at " << *MI);
+
+ auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ ActiveChains[DestReg] = G.get();
+ AllChains.insert(std::move(G));
+
+ } else if (isMla(MI)) {
+
+ // It is beneficial to keep MLAs on the same functional unit as their
+ // accumulator operand.
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned AccumReg = MI->getOperand(3).getReg();
+
+ maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
+ maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
+ if (DestReg != AccumReg)
+ maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
+
+ if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
+ DEBUG(dbgs() << "Chain found for accumulator register "
+ << TRI->getName(AccumReg) << " in MI " << *MI);
+
+ // For simplicity we only chain together sequences of MULs/MLAs where the
+ // accumulator register is killed on each instruction. This means we don't
+ // need to track other uses of the registers we want to rewrite.
+ //
+ // FIXME: We could extend to handle the non-kill cases for more coverage.
+ if (MI->getOperand(3).isKill()) {
+ // Add to chain.
+ DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+ ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
+ // Handle cases where the destination is not the same as the accumulator.
+ if (DestReg != AccumReg) {
+ ActiveChains[DestReg] = ActiveChains[AccumReg];
+ ActiveChains.erase(AccumReg);
+ }
+ return;
+ }
+
+ DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
+ << "marked <kill>!\n");
+ maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
+ }
+
+ DEBUG(dbgs() << "Creating new chain for dest register "
+ << TRI->getName(DestReg) << "\n");
+ auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ ActiveChains[DestReg] = G.get();
+ AllChains.insert(std::move(G));
+
+ } else {
+
+ // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
+ // lists.
+ for (auto &I : MI->uses())
+ maybeKillChain(I, Idx, ActiveChains);
+ for (auto &I : MI->defs())
+ maybeKillChain(I, Idx, ActiveChains);
+
+ }
+}
+
+void AArch64A57FPLoadBalancing::
+maybeKillChain(MachineOperand &MO, unsigned Idx,
+ std::map<unsigned, Chain*> &ActiveChains) {
+ // Given an operand and the set of active chains (keyed by register),
+ // determine if a chain should be ended and remove from ActiveChains.
+ MachineInstr *MI = MO.getParent();
+
+ if (MO.isReg()) {
+
+ // If this is a KILL of a current chain, record it.
+ if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
+ DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
+ << "\n");
+ ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
+ }
+ ActiveChains.erase(MO.getReg());
+
+ } else if (MO.isRegMask()) {
+
+ for (auto I = ActiveChains.begin(), E = ActiveChains.end();
+ I != E;) {
+ if (MO.clobbersPhysReg(I->first)) {
+ DEBUG(dbgs() << "Kill (regmask) seen for chain "
+ << TRI->getName(I->first) << "\n");
+ I->second->setKill(MI, Idx, /*Immutable=*/true);
+ ActiveChains.erase(I++);
+ } else
+ ++I;
+ }
+
+ }
+}
+
+Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
+ if ((TRI->getEncodingValue(Reg) % 2) == 0)
+ return Color::Even;
+ else
+ return Color::Odd;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the pass manager.
+FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
+ return new AArch64A57FPLoadBalancing();
+}
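As a worked illustration of the chain machinery above (hand-written; register choices are arbitrary): scanInstruction links multiply-accumulates through their killed accumulator operand, and colorChain then rewrites every def in the chain to a single scavenged register of the chosen color:

    // A chain built by scanInstruction (d0 has an even encoding):
    //   fmul d0, d1, d2            // starts the chain
    //   fmla d4, d5, d6, d0        // d0 killed here: joins the chain
    //   fmla d8, d9, d10, d4       // d4 killed here: chain size is now 3
    //
    // If the block is even-heavy (Parity > 0), colorChainSet prefers Odd,
    // and colorChain may scavenge an odd register such as d7, giving:
    //   fmul d7, d1, d2
    //   fmla d7, d5, d6, d7
    //   fmla d7, d9, d10, d7
    //
    // (This assumes the chain's final value has a recorded, mutable kill;
    // otherwise requiresFixup() keeps the last def unchanged.)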
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index ab2c4b7..287989f 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -19,7 +19,7 @@
// a = add nsw i64 f, 3
// e = getelementptr ..., i64 a
//
-// This is legal to do so if the computations are markers with either nsw or nuw
+// This is legal to do if the computations are marked with either nsw or nuw
// markers.
// Moreover, the current heuristic is simple: it does not create new sext
// operations, i.e., it gives up when a sext would have forked (e.g., if
@@ -223,7 +223,7 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
}
// Input:
-// - SExtInsts contains all the sext instructions that are use direclty in
+// - SExtInsts contains all the sext instructions that are used directly in
// GetElementPtrInst, i.e., access to memory.
// Algorithm:
// - For each sext operation in SExtInsts:
@@ -353,7 +353,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
// If the use is already of the right type, connect its uses to its argument
// and delete it.
- // This can happen for an Instruction which all uses are sign extended.
+ // This can happen for an Instruction all uses of which are sign extended.
if (!ToRemove.count(SExt) &&
SExt->getType() == SExt->getOperand(0)->getType()) {
DEBUG(dbgs() << "Sign extension is useless, attach its use to "
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 734fb21..5afe0f4 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -36,9 +36,10 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -166,6 +167,12 @@ static int getTransformOpcode(unsigned Opc) {
return AArch64::ADDv1i64;
case AArch64::SUBXrr:
return AArch64::SUBv1i64;
+ case AArch64::ANDXrr:
+ return AArch64::ANDv8i8;
+ case AArch64::EORXrr:
+ return AArch64::EORv8i8;
+ case AArch64::ORRXrr:
+ return AArch64::ORRv8i8;
}
// No AdvSIMD equivalent, so just return the original opcode.
return Opc;
@@ -371,7 +378,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
const TargetMachine &TM = mf.getTarget();
MRI = &mf.getRegInfo();
- TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
// Just check things on a one-block-at-a-time basis.
for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
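For intuition on the new getTransformOpcode entries above, a hand-written before/after (register choices illustrative): when the operands of a GPR logical op already live in FPRs, rewriting to the AdvSIMD form avoids cross-register-bank copies:

    // Before (values start and end in FPRs, so copies are needed):
    //   fmov x0, d0
    //   fmov x1, d1
    //   and  x0, x0, x1            // ANDXrr
    //   fmov d2, x0
    //
    // After the pass, using the new ANDXrr -> ANDv8i8 mapping:
    //   and  v2.8b, v0.8b, v1.8b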
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index cd94e24..08ee687 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "AArch64MachineFunctionInfo.h"
#include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
@@ -23,8 +23,8 @@
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -54,7 +54,7 @@ public:
AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer),
Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
- MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+ MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr),
LOHLabelCounter(0) {}
const char *getPassName() const override {
@@ -145,7 +145,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
if (!Stubs.empty()) {
OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer.EmitLabel(Stubs[i].first);
@@ -252,8 +252,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
bool isVector, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
- const AArch64RegisterInfo *RI =
- static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+ const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
unsigned Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
@@ -381,8 +381,23 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
unsigned NumNOPBytes = MI.getOperand(1).getImm();
SM.recordStackMap(MI);
- // Emit padding.
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+ // Scan ahead to trim the shadow.
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (NumNOPBytes > 0) {
+ if (MII == MBB.end() || MII->isCall() ||
+ MII->getOpcode() == AArch64::DBG_VALUE ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ++MII;
+ NumNOPBytes -= 4;
+ }
+
+ // Emit nops.
for (unsigned i = 0; i < NumNOPBytes; i += 4)
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
@@ -518,7 +533,5 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
extern "C" void LLVMInitializeAArch64AsmPrinter() {
RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-
- RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
- RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target);
}
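A short walk-through of the shadow trimming added to LowerSTACKMAP above (the instruction sequence is hand-written): real instructions already following the stackmap count toward the requested shadow, so fewer nops are emitted, and scanning stops at a call:

    //   STACKMAP 0, 16             // requests a 16-byte nop shadow
    //   add x0, x0, #1             // scanned: NumNOPBytes 16 -> 12
    //   ldr x1, [x2]               // scanned: NumNOPBytes 12 -> 8
    //   bl  callee                 // isCall(): stop trimming
    //
    // Only 8 bytes of padding remain, so two HINT #0 instructions are
    // emitted instead of four.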
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index 484e7e8..e2b6367 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -12,15 +12,16 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-branch-relax"
@@ -136,7 +137,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
if (NextBB == MBB->getParent()->end())
return false;
- for (MachineBasicBlock *S : MBB->successors())
+ for (MachineBasicBlock *S : MBB->successors())
if (S == NextBB)
return true;
@@ -475,7 +476,9 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
- TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+ TII = (const AArch64InstrInfo *)MF->getTarget()
+ .getSubtargetImpl()
+ ->getInstrInfo();
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
new file mode 100644
index 0000000..baf80bc
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,141 @@
+//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7};
+static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7};
+static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7};
+static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7};
+static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+ MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+ CCState &State, unsigned SlotAlign) {
+ unsigned Size = LocVT.getSizeInBits() / 8;
+ unsigned StackAlign = State.getMachineFunction()
+ .getSubtarget()
+ .getDataLayout()
+ ->getStackAlignment();
+ unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+ for (auto &It : PendingMembers) {
+ It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+ State.addLoc(It);
+ SlotAlign = 1;
+ }
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+ return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
+/// [N x Ty] type must still be contiguous in memory though.
+static bool CC_AArch64_Custom_Stack_Block(
+ unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // Try to allocate a contiguous block of registers, each of the correct
+ // size to hold one member.
+ ArrayRef<uint16_t> RegList;
+ if (LocVT.SimpleTy == MVT::i64)
+ RegList = XRegList;
+ else if (LocVT.SimpleTy == MVT::f16)
+ RegList = HRegList;
+ else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
+ RegList = SRegList;
+ else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
+ RegList = DRegList;
+ else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
+ RegList = QRegList;
+ else {
+ // Not an array we want to split up after all.
+ return false;
+ }
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+ if (RegResult) {
+ for (auto &It : PendingMembers) {
+ It.convertToReg(RegResult);
+ State.addLoc(It);
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
+ }
+
+ // Mark all regs in the class as unavailable
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
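A walk-through of CC_AArch64_Custom_Block on a homogeneous floating-point aggregate (the source-level struct and the register numbers are illustrative, not from the source):

    // struct HFA { double a, b, c; };  // arrives as three consecutive f64s
    //
    // Each member is flagged InConsecutiveRegs. The first two are queued as
    // pending locations; the last one triggers
    // State.AllocateRegBlock(DRegList, 3). If three consecutive registers
    // in D0-D7 are free, the members land in, say, D2, D3 and D4.
    // Otherwise every remaining D register is marked used and the whole
    // block goes contiguously to the stack via finishStackBlock (first
    // slot 8-byte aligned for AAPCS; natural alignment on Darwin, where
    // SlotAlign is 1).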
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 1fe5138..1a80402 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> :
CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
- CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+ CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>;
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
@@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -60,18 +62,18 @@ def CC_AArch64_AAPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToStack<16, 16>>
]>;
@@ -96,10 +98,10 @@ def RetCC_AArch64_AAPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
]>;
@@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -139,25 +143,28 @@ def CC_AArch64_DarwinPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
]>;
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
// Handle all scalar types as either i64 or f64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
CCIfType<[f16, f32], CCPromoteToType<f64>>,
@@ -165,8 +172,10 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
]>;
// The WebKit_JS calling convention only passes the first argument (the callee)
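Taken together, the calling-convention hunks above extend every rule that already covered the 64-bit and 128-bit vector types to the new half-precision vectors: v4f16 follows the D-register and 8-byte stack rules, v8f16 the Q-register and 16-byte stack rules. As a minimal sketch of what the updated AAPCS table means for a run of v4f16 arguments (plain C++, not the LLVM CCState machinery; all names below are illustrative):

    #include <cstdio>

    struct Loc { bool InReg; unsigned RegOrOffset; };

    // First eight 64-bit vector arguments land in D0-D7 (shadowing Q0-Q7);
    // the rest go to 8-byte-aligned stack slots (CCAssignToStack<8, 8>).
    static Loc assignV4F16(unsigned &NextD, unsigned &StackOffset) {
      if (NextD < 8)
        return {true, NextD++};
      Loc L = {false, StackOffset};
      StackOffset += 8;
      return L;
    }

    int main() {
      unsigned NextD = 0, StackOffset = 0;
      for (int I = 0; I < 10; ++I) {
        Loc L = assignV4F16(NextD, StackOffset);
        if (L.InReg)
          std::printf("arg%d -> D%u\n", I, L.RegOrOffset);
        else
          std::printf("arg%d -> [sp, #%u]\n", I, L.RegOrOffset);
      }
      return 0;
    }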
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 4d23dc5..aab8e38 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -94,7 +94,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
MachineFunction *MF = I->getParent()->getParent();
const AArch64TargetMachine *TM =
static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
+ const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
// code sequence assumes the address will be.
@@ -114,7 +114,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
MachineFunction *MF = I->getParent()->getParent();
const AArch64TargetMachine *TM =
static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
+ const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
// Create a virtual register for the TLS base address.
MachineRegisterInfo &RegInfo = MF->getRegInfo();
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 6b1f096..87b545b 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -101,25 +101,26 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-collect-loh"
@@ -194,12 +195,14 @@ typedef SetVector<const MachineInstr *> SetOfMachineInstr;
/// Map a basic block to a set of instructions per register.
/// This is used to represent the exposed uses of a basic block
/// per register.
-typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+typedef MapVector<const MachineBasicBlock *,
+ std::unique_ptr<SetOfMachineInstr[]>>
BlockToSetOfInstrsPerColor;
/// Map a basic block to an instruction per register.
/// This is used to represent the live-out definitions of a basic block
/// per register.
-typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+typedef MapVector<const MachineBasicBlock *,
+ std::unique_ptr<const MachineInstr *[]>>
BlockToInstrPerColor;
/// Map an instruction to a set of instructions. Used to represent the
/// mapping def to reachable uses or use to definitions.
@@ -236,9 +239,9 @@ static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
SetOfMachineInstr *result;
BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
if (it != sets.end())
- result = it->second;
+ result = it->second.get();
else
- result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+ result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get();
return result[reg];
}
@@ -283,14 +286,14 @@ static void initReachingDef(MachineFunction &MF,
const MapRegToId &RegToId,
const MachineInstr *DummyOp, bool ADRPMode) {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
unsigned NbReg = RegToId.size();
for (MachineBasicBlock &MBB : MF) {
- const MachineInstr **&BBGen = Gen[&MBB];
- BBGen = new const MachineInstr *[NbReg];
- memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+ auto &BBGen = Gen[&MBB];
+ BBGen = make_unique<const MachineInstr *[]>(NbReg);
+ std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr);
BitVector &BBKillSet = Kill[&MBB];
BBKillSet.resize(NbReg);
@@ -421,22 +424,6 @@ static void reachingDefAlgorithm(MachineFunction &MF,
} while (HasChanged);
}
-/// Release all memory dynamically allocated during the reaching
-/// definition algorithm.
-static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
- BlockToSetOfInstrsPerColor &Out,
- BlockToInstrPerColor &Gen,
- BlockToSetOfInstrsPerColor &ReachableUses) {
- for (auto &IT : Out)
- delete[] IT.second;
- for (auto &IT : In)
- delete[] IT.second;
- for (auto &IT : ReachableUses)
- delete[] IT.second;
- for (auto &IT : Gen)
- delete[] IT.second;
-}
-
/// Reaching definition algorithm.
/// \param MF function on which the algorithm will operate.
/// \param[out] ColorOpToReachedUses will contain the result of the reaching
@@ -473,9 +460,6 @@ static void reachingDef(MachineFunction &MF,
if (!DummyOp)
reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
ReachableUses, RegToId.size());
-
- // finit.
- finitReachingDef(In, Out, Gen, ReachableUses);
}
#ifndef NDEBUG
@@ -1043,7 +1027,7 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
MapRegToId RegToId;
@@ -1059,8 +1043,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *DummyOp = nullptr;
if (BasicBlockScopeOnly) {
- const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
// For local analysis, create a dummy operation to record uses that are not
// local.
DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
new file mode 100644
index 0000000..0fbd3c6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -0,0 +1,422 @@
+//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to make consecutive compares of values use the same
+// operands so that the CSE pass can remove duplicated instructions. It
+// analyzes
+// branches and adjusts comparisons with immediate values by converting:
+// * GE -> GT
+// * GT -> GE
+// * LT -> LE
+// * LE -> LT
+// and adjusting immediate values appropriately. It basically corrects two
+// immediate values towards each other to make them equal.
+//
+// Consider the following example in C:
+//
+// if ((a < 5 && ...) || (a > 5 && ...)) {
+// ~~~~~ ~~~~~
+// ^ ^
+// x y
+//
+// Here both "x" and "y" expressions compare "a" with "5". When "x" evaluates
+// to "false", "y" can just check the flags set by the first comparison. As a
+// result of the canonicalization employed by
+// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific
+// code, the assembly ends up in a form that is not CSE-friendly:
+//
+// ...
+// cmp w8, #4
+// b.gt .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #6
+// b.lt .LBB0_6
+// ...
+//
+// Same assembly after the pass:
+//
+// ...
+// cmp w8, #5
+// b.ge .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #5 // <-- CSE pass removes this instruction
+// b.le .LBB0_6
+// ...
+//
+// Currently only SUBS and ADDS followed by b.?? are supported.
+//
+// TODO: maybe handle TBNZ/TBZ when they are used in place of CMP for "a < 0"
+// TODO: handle other conditional instructions (e.g. CSET)
+// TODO: allow second branching to be anything if it doesn't require adjusting
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-condopt"
+
+STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted");
+
+namespace {
+class AArch64ConditionOptimizer : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree;
+
+public:
+  // Stores the immediate, compare instruction opcode, and branch condition
+  // (in this order) of an adjusted comparison.
+ typedef std::tuple<int, int, AArch64CC::CondCode> CmpInfo;
+
+ static char ID;
+ AArch64ConditionOptimizer() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineInstr *findSuitableCompare(MachineBasicBlock *MBB);
+ CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
+ void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+ bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
+ int ToImm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override {
+ return "AArch64 Condition Optimizer";
+ }
+};
+} // end anonymous namespace
+
+char AArch64ConditionOptimizer::ID = 0;
+
+namespace llvm {
+void initializeAArch64ConditionOptimizerPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
+ return new AArch64ConditionOptimizer();
+}
+
+void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+// Finds the compare instruction that feeds the supported kinds of branching.
+// Returns the instruction, or nullptr if it fails or finds an unsupported
+// instruction.
+MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+ if (I == MBB->end())
+ return nullptr;
+
+ if (I->getOpcode() != AArch64::Bcc)
+ return nullptr;
+
+ // Now find the instruction controlling the terminator.
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+ --I;
+ assert(!I->isTerminator() && "Spurious terminator");
+ switch (I->getOpcode()) {
+ // cmp is an alias for subs with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ // cmn is an alias for adds with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ if (I->getOperand(0).isDead())
+ return I;
+
+ DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ return nullptr;
+
+    // Prevent a false-positive case like:
+ // cmp w19, #0
+ // cinc w0, w19, gt
+ // ...
+ // fcmp d8, #0.0
+ // b.gt .LBB0_5
+ case AArch64::FCMPDri:
+ case AArch64::FCMPSri:
+ case AArch64::FCMPESri:
+ case AArch64::FCMPEDri:
+
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXrr:
+ case AArch64::FCMPSrr:
+ case AArch64::FCMPDrr:
+ case AArch64::FCMPESrr:
+ case AArch64::FCMPEDrr:
+ // Skip comparison instructions without immediate operands.
+ return nullptr;
+ }
+ }
+ DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+ return nullptr;
+}
+
+// Changes the opcode adds <-> subs, preserving the register operand width.
+static int getComplementOpc(int Opc) {
+ switch (Opc) {
+ case AArch64::ADDSWri: return AArch64::SUBSWri;
+ case AArch64::ADDSXri: return AArch64::SUBSXri;
+ case AArch64::SUBSWri: return AArch64::ADDSWri;
+ case AArch64::SUBSXri: return AArch64::ADDSXri;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Changes the form of the comparison: inclusive <-> exclusive.
+static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) {
+ switch (Cmp) {
+ case AArch64CC::GT: return AArch64CC::GE;
+ case AArch64CC::GE: return AArch64CC::GT;
+ case AArch64CC::LT: return AArch64CC::LE;
+ case AArch64CC::LE: return AArch64CC::LT;
+ default:
+ llvm_unreachable("Unexpected condition code");
+ }
+}
+
+// Transforms GT -> GE, GE -> GT, LT -> LE, LE -> LT by updating comparison
+// operator and condition code.
+AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp(
+ MachineInstr *CmpMI, AArch64CC::CondCode Cmp) {
+ int Opc = CmpMI->getOpcode();
+
+  // CMN (compare with negative immediate) is an alias for ADDS (as
+  // "operand - negative" == "operand + positive").
+ bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri);
+
+ int Correction = (Cmp == AArch64CC::GT) ? 1 : -1;
+ // Negate Correction value for comparison with negative immediate (CMN).
+ if (Negative) {
+ Correction = -Correction;
+ }
+
+ const int OldImm = (int)CmpMI->getOperand(2).getImm();
+ const int NewImm = std::abs(OldImm + Correction);
+
+ // Handle +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions by
+ // adjusting compare instruction opcode.
+ if (OldImm == 0 && ((Negative && Correction == 1) ||
+ (!Negative && Correction == -1))) {
+ Opc = getComplementOpc(Opc);
+ }
+
+ return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp));
+}
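+
+// For example, "cmp w8, #4" with GT adjusts to "cmp w8, #5" with GE
+// (a > 4 <=> a >= 5), while "cmp w8, #0" with LT crosses zero and becomes
+// "cmn w8, #1" with LE (a < 0 <=> a <= -1), switching SUBS to ADDS via
+// getComplementOpc().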
+
+// Applies changes to comparison instruction suggested by adjustCmp().
+void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
+ const CmpInfo &Info) {
+ int Imm;
+ int Opc;
+ AArch64CC::CondCode Cmp;
+ std::tie(Imm, Opc, Cmp) = Info;
+
+ MachineBasicBlock *const MBB = CmpMI->getParent();
+
+ // Change immediate in comparison instruction (ADDS or SUBS).
+ BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
+ .addOperand(CmpMI->getOperand(0))
+ .addOperand(CmpMI->getOperand(1))
+ .addImm(Imm)
+ .addOperand(CmpMI->getOperand(3));
+ CmpMI->eraseFromParent();
+
+ // The fact that this comparison was picked ensures that it's related to the
+ // first terminator instruction.
+ MachineInstr *BrMI = MBB->getFirstTerminator();
+
+ // Change condition in branch instruction.
+ BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(Cmp)
+ .addOperand(BrMI->getOperand(1));
+ BrMI->eraseFromParent();
+
+ MBB->updateTerminator();
+
+ ++NumConditionsAdjusted;
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Returns true if parsing was successful, false otherwise.
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+ // A normal br.cond simply has the condition code.
+ if (Cond[0].getImm() != -1) {
+ assert(Cond.size() == 1 && "Unknown Cond array format");
+ CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+ return true;
+ }
+ return false;
+}
+
+// Adjusts one cmp instruction to match another if the result of the
+// adjustment will allow CSE. Returns true if the compare instruction was
+// changed, false otherwise.
+bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
+                                         AArch64CC::CondCode Cmp,
+                                         MachineInstr *To, int ToImm) {
+ CmpInfo Info = adjustCmp(CmpMI, Cmp);
+ if (std::get<0>(Info) == ToImm && std::get<1>(Info) == To->getOpcode()) {
+ modifyCmp(CmpMI, Info);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TII = MF.getTarget().getSubtargetImpl()->getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ bool Changed = false;
+
+ // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+ // cmp-conversions from the same head block.
+ // Note that updateDomTree() modifies the children of the DomTree node
+ // currently being visited. The df_iterator supports that; it doesn't look at
+ // child_begin() / child_end() until after a node has been visited.
+ for (MachineDomTreeNode *I : depth_first(DomTree)) {
+ MachineBasicBlock *HBB = I->getBlock();
+
+ SmallVector<MachineOperand, 4> HeadCond;
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+ continue;
+ }
+
+    // The TBB == HBB check skips single-block loops.
+ if (!TBB || TBB == HBB) {
+ continue;
+ }
+
+ SmallVector<MachineOperand, 4> TrueCond;
+ MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
+ if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+ continue;
+ }
+
+ MachineInstr *HeadCmpMI = findSuitableCompare(HBB);
+ if (!HeadCmpMI) {
+ continue;
+ }
+
+ MachineInstr *TrueCmpMI = findSuitableCompare(TBB);
+ if (!TrueCmpMI) {
+ continue;
+ }
+
+ AArch64CC::CondCode HeadCmp;
+ if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
+ continue;
+ }
+
+ AArch64CC::CondCode TrueCmp;
+ if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+ continue;
+ }
+
+ const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
+ const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
+
+ DEBUG(dbgs() << "Head branch:\n");
+ DEBUG(dbgs() << "\tcondition: "
+ << AArch64CC::getCondCodeName(HeadCmp) << '\n');
+ DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+
+ DEBUG(dbgs() << "True branch:\n");
+ DEBUG(dbgs() << "\tcondition: "
+ << AArch64CC::getCondCodeName(TrueCmp) << '\n');
+ DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+
+ if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
+ (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
+ std::abs(TrueImm - HeadImm) == 2) {
+ // This branch transforms machine instructions that correspond to
+ //
+ // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...)
+ // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...)
+ //
+ // into
+ //
+ // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...)
+ // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...)
+
+ CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp);
+ CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp);
+ if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) &&
+ std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) {
+ modifyCmp(HeadCmpMI, HeadCmpInfo);
+ modifyCmp(TrueCmpMI, TrueCmpInfo);
+ Changed = true;
+ }
+ } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) ||
+ (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) &&
+ std::abs(TrueImm - HeadImm) == 1) {
+ // This branch transforms machine instructions that correspond to
+ //
+ // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...)
+ // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...)
+ //
+ // into
+ //
+ // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...)
+ // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      // The GT -> GE transformation increases the immediate value, so pick
+      // the smaller one; LT -> LE decreases it, so invert the choice.
+ bool adjustHeadCond = (HeadImm < TrueImm);
+ if (HeadCmp == AArch64CC::LT) {
+ adjustHeadCond = !adjustHeadCond;
+ }
+
+ if (adjustHeadCond) {
+ Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm);
+ } else {
+ Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm);
+ }
+ }
+  // Other transformation cases almost never occur, because codegen emits
+  // < or > comparisons rather than <= and >=.
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 452cdec..54f53dc 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -191,8 +191,8 @@ public:
/// runOnMachineFunction - Initialize per-function data structures.
void runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
- TII = MF.getTarget().getInstrInfo();
- TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
}
@@ -723,7 +723,7 @@ namespace {
class AArch64ConditionalCompares : public MachineFunctionPass {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const MCSchedModel *SchedModel;
+ MCSchedModel SchedModel;
  // Whether the function being processed has the Oz attribute.
bool MinSize;
MachineRegisterInfo *MRI;
@@ -845,7 +845,7 @@ bool AArch64ConditionalCompares::shouldConvert() {
// the cost of a misprediction.
//
// Set a limit on the delay we will accept.
- unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
+ unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4;
// Instruction depths can be computed for all trace instructions above CmpBB.
unsigned HeadDepth =
@@ -891,8 +891,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
- TII = MF.getTarget().getInstrInfo();
- TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
SchedModel =
MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
MRI = &MF.getRegInfo();
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index a2d853c..74fc167 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -14,11 +14,12 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-dead-defs"
@@ -36,11 +37,11 @@ public:
static char ID; // Pass identification, replacement for typeid.
explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &F) override;
+ bool runOnMachineFunction(MachineFunction &F) override;
const char *getPassName() const override { return "Dead register definitions"; }
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -119,7 +120,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
// Scan the function for instructions that have a dead definition of a
// register. Replace that register with the zero register when possible.
bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
- TRI = MF.getTarget().getRegisterInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
bool Changed = false;
DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 8839085..c850680 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/MathExtras.h"
@@ -722,7 +723,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
}
bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
bool Modified = false;
for (auto &MBB : MF)
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 2164d77..419fbc8 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -14,9 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64CallingConvention.h"
#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -39,8 +41,7 @@ using namespace llvm;
namespace {
-class AArch64FastISel : public FastISel {
-
+class AArch64FastISel final : public FastISel {
class Address {
public:
typedef enum {
@@ -50,16 +51,23 @@ class AArch64FastISel : public FastISel {
private:
BaseKind Kind;
+ AArch64_AM::ShiftExtendType ExtType;
union {
unsigned Reg;
int FI;
} Base;
+ unsigned OffsetReg;
+ unsigned Shift;
int64_t Offset;
+ const GlobalValue *GV;
public:
- Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+ Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
+ OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
+ void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
+ AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
void setReg(unsigned Reg) {
@@ -70,6 +78,12 @@ class AArch64FastISel : public FastISel {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+ void setOffsetReg(unsigned Reg) {
+ OffsetReg = Reg;
+ }
+ unsigned getOffsetReg() const {
+ return OffsetReg;
+ }
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
@@ -80,8 +94,11 @@ class AArch64FastISel : public FastISel {
}
void setOffset(int64_t O) { Offset = O; }
int64_t getOffset() { return Offset; }
+ void setShift(unsigned S) { Shift = S; }
+ unsigned getShift() { return Shift; }
- bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+ void setGlobalValue(const GlobalValue *G) { GV = G; }
+ const GlobalValue *getGlobalValue() { return GV; }
};
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
@@ -89,74 +106,152 @@ class AArch64FastISel : public FastISel {
const AArch64Subtarget *Subtarget;
LLVMContext *Context;
+ bool fastLowerArguments() override;
+ bool fastLowerCall(CallLoweringInfo &CLI) override;
+ bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
private:
// Selection routines.
- bool SelectLoad(const Instruction *I);
- bool SelectStore(const Instruction *I);
- bool SelectBranch(const Instruction *I);
- bool SelectIndirectBr(const Instruction *I);
- bool SelectCmp(const Instruction *I);
- bool SelectSelect(const Instruction *I);
- bool SelectFPExt(const Instruction *I);
- bool SelectFPTrunc(const Instruction *I);
- bool SelectFPToInt(const Instruction *I, bool Signed);
- bool SelectIntToFP(const Instruction *I, bool Signed);
- bool SelectRem(const Instruction *I, unsigned ISDOpcode);
- bool SelectCall(const Instruction *I, const char *IntrMemName);
- bool SelectIntrinsicCall(const IntrinsicInst &I);
- bool SelectRet(const Instruction *I);
- bool SelectTrunc(const Instruction *I);
- bool SelectIntExt(const Instruction *I);
- bool SelectMul(const Instruction *I);
+ bool selectAddSub(const Instruction *I);
+ bool selectLogicalOp(const Instruction *I);
+ bool selectLoad(const Instruction *I);
+ bool selectStore(const Instruction *I);
+ bool selectBranch(const Instruction *I);
+ bool selectIndirectBr(const Instruction *I);
+ bool selectCmp(const Instruction *I);
+ bool selectSelect(const Instruction *I);
+ bool selectFPExt(const Instruction *I);
+ bool selectFPTrunc(const Instruction *I);
+ bool selectFPToInt(const Instruction *I, bool Signed);
+ bool selectIntToFP(const Instruction *I, bool Signed);
+ bool selectRem(const Instruction *I, unsigned ISDOpcode);
+ bool selectRet(const Instruction *I);
+ bool selectTrunc(const Instruction *I);
+ bool selectIntExt(const Instruction *I);
+ bool selectMul(const Instruction *I);
+ bool selectShift(const Instruction *I);
+ bool selectBitCast(const Instruction *I);
+ bool selectFRem(const Instruction *I);
+ bool selectSDiv(const Instruction *I);
+ bool selectGetElementPtr(const Instruction *I);
// Utility helper routines.
bool isTypeLegal(Type *Ty, MVT &VT);
- bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
- bool ComputeAddress(const Value *Obj, Address &Addr);
- bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
- bool UseUnscaled);
- void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
- unsigned Flags, bool UseUnscaled);
- bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
- bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false);
+ bool isValueAvailable(const Value *V) const;
+ bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr);
+ bool computeCallAddress(const Value *V, Address &Addr);
+ bool simplifyAddress(Address &Addr, MVT VT);
+ void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+ unsigned Flags, unsigned ScaleFactor,
+ MachineMemOperand *MMO);
+ bool isMemCpySmall(uint64_t Len, unsigned Alignment);
+ bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
unsigned Alignment);
- // Emit functions.
- bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
- bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
- bool UseUnscaled = false);
- bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
- bool UseUnscaled = false);
- unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
- unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+ bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I,
+ const Value *Cond);
+ bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
+ bool optimizeSelect(const SelectInst *SI);
+ std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
+
+ // Emit helper routines.
+ unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+ const Value *RHS, bool SetFlags = false,
+ bool WantResult = true, bool IsZExt = false);
+ unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ bool SetFlags = false, bool WantResult = true);
+ unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+ bool WantResult = true);
+ unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool SetFlags = false,
+ bool WantResult = true);
+ unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ExtType,
+ uint64_t ShiftImm, bool SetFlags = false,
+ bool WantResult = true);
- unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
- unsigned AArch64MaterializeGV(const GlobalValue *GV);
+ // Emit functions.
+ bool emitCompareAndBranch(const BranchInst *BI);
+ bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
+ bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
+ bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+ bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
+ unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
+ MachineMemOperand *MMO = nullptr);
+ bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+ MachineMemOperand *MMO = nullptr);
+ unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags = false, bool WantResult = true,
+ bool IsZExt = false);
+ unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+ unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags = false, bool WantResult = true,
+ bool IsZExt = false);
+ unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
+ unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
+ bool WantResult = true);
+ unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+ const Value *RHS);
+ unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm);
+ unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ uint64_t ShiftImm);
+ unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+ unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = true);
+ unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = true);
+ unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = false);
+
+ unsigned materializeInt(const ConstantInt *CI, MVT VT);
+ unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned materializeGV(const GlobalValue *GV);
// Call handling routines.
private:
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
- bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
- SmallVectorImpl<MVT> &ArgVTs,
- SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+ bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
unsigned &NumBytes);
- bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
- const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+ bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
public:
// Backend specific FastISel code.
- unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
- unsigned TargetMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
- explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo)
- : FastISel(funcInfo, libInfo) {
+ explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo)
+ : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
- Context = &funcInfo.Fn->getContext();
+ Context = &FuncInfo.Fn->getContext();
}
- bool TargetSelectInstruction(const Instruction *I) override;
+ bool fastSelectInstruction(const Instruction *I) override;
#include "AArch64GenFastISel.inc"
};
@@ -165,13 +260,52 @@ public:
#include "AArch64GenCallingConv.inc"
+/// \brief Check if the sign-/zero-extend will be a noop.
+static bool isIntExtFree(const Instruction *I) {
+ assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ "Unexpected integer extend instruction.");
+ assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+ "Unexpected value type.");
+ bool IsZExt = isa<ZExtInst>(I);
+
+ if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
+ if (LI->hasOneUse())
+ return true;
+
+ if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0)))
+ if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr()))
+ return true;
+
+ return false;
+}
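+
+// For example, "zext i32 %x to i64" is free under this test when %x is a
+// single-use load (the extend can be folded into the load) or an argument
+// that already carries the zeroext attribute.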
+
+/// \brief Determine the implicit scale factor that is applied by a memory
+/// operation for a given value type.
+static unsigned getImplicitScaleFactor(MVT VT) {
+ switch (VT.SimpleTy) {
+ default:
+ return 0; // invalid
+ case MVT::i1: // fall-through
+ case MVT::i8:
+ return 1;
+ case MVT::i16:
+ return 2;
+ case MVT::i32: // fall-through
+ case MVT::f32:
+ return 4;
+ case MVT::i64: // fall-through
+ case MVT::f64:
+ return 8;
+ }
+}
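+
+// For example, MVT::i32 yields 4: the scaled form LDRWui encodes its
+// immediate in 4-byte units, so an encoded offset of 3 addresses byte 12.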
+
CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
if (CC == CallingConv::WebKit_JS)
return CC_AArch64_WebKit_JS;
return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
}
-unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
"Alloca should always return a pointer.");
@@ -183,7 +317,7 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
.addFrameIndex(SI->second)
@@ -195,28 +329,59 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
return 0;
}
-unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
+ if (VT > MVT::i64)
+ return 0;
+
+ if (!CI->isZero())
+ return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+ // Create a copy from the zero register to materialize a "0" value.
+ const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
+ : &AArch64::GPR32RegClass;
+ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(ZeroReg, getKillRegState(true));
+ return ResultReg;
+}
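+
+// For example, materializing "i32 0" costs a single COPY from WZR, while any
+// non-zero constant goes through the generic fastEmit_i path.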
+
+unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+ // Positive zero (+0.0) has to be materialized with a fmov from the zero
+ // register, because the immediate version of fmov cannot encode zero.
+ if (CFP->isNullValue())
+ return fastMaterializeFloatZero(CFP);
+
if (VT != MVT::f32 && VT != MVT::f64)
return 0;
const APFloat Val = CFP->getValueAPF();
- bool is64bit = (VT == MVT::f64);
-
+ bool Is64Bit = (VT == MVT::f64);
// This checks to see if we can use FMOV instructions to materialize
// a constant, otherwise we have to materialize via the constant pool.
if (TLI.isFPImmLegal(Val, VT)) {
- int Imm;
- unsigned Opc;
- if (is64bit) {
- Imm = AArch64_AM::getFP64Imm(Val);
- Opc = AArch64::FMOVDi;
- } else {
- Imm = AArch64_AM::getFP32Imm(Val);
- Opc = AArch64::FMOVSi;
- }
+ int Imm =
+ Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
+ assert((Imm != -1) && "Cannot encode floating-point constant.");
+ unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi;
+ return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+ }
+
+ // For the MachO large code model materialize the FP constant in code.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+ const TargetRegisterClass *RC = Is64Bit ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ unsigned TmpReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg)
+ .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(TmpReg, getKillRegState(true));
+
return ResultReg;
}
@@ -226,20 +391,20 @@ unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
if (Align == 0)
Align = DL.getTypeAllocSize(CFP->getType());
- unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
- ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
+ ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
- unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
+ unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(ADRPReg)
- .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
return ResultReg;
}
-unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
+unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// We can't handle thread-local variables quickly yet.
if (GV->isThreadLocal())
return 0;
@@ -262,30 +427,34 @@ unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
// ADRP + LDRX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
ResultReg = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ } else if (OpFlags & AArch64II::MO_CONSTPOOL) {
+ // We can't handle addresses loaded from a constant pool quickly yet.
+ return 0;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
- ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+ ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
- .addImm(0);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+ .addImm(0);
}
return ResultReg;
}
-unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
EVT CEVT = TLI.getValueType(C->getType(), true);
// Only handle simple types.
@@ -293,17 +462,48 @@ unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
return 0;
MVT VT = CEVT.getSimpleVT();
- // FIXME: Handle ConstantInt.
- if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
- return AArch64MaterializeFP(CFP, VT);
+ if (const auto *CI = dyn_cast<ConstantInt>(C))
+ return materializeInt(CI, VT);
+ else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return materializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
- return AArch64MaterializeGV(GV);
+ return materializeGV(GV);
return 0;
}
+unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
+ assert(CFP->isNullValue() &&
+ "Floating-point constant is not a positive zero.");
+ MVT VT;
+ if (!isTypeLegal(CFP->getType(), VT))
+ return 0;
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return 0;
+
+ bool Is64Bit = (VT == MVT::f64);
+ unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+ unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr;
+ return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
+}
+
+/// \brief Check if the multiply is by a power-of-2 constant.
+static bool isMulPowOf2(const Value *I) {
+ if (const auto *MI = dyn_cast<MulOperator>(I)) {
+ if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
+ if (C->getValue().isPowerOf2())
+ return true;
+ if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1)))
+ if (C->getValue().isPowerOf2())
+ return true;
+ }
+ return false;
+}
+
// Computes the address to get to an object.
-bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr,
+                                     Type *Ty) {
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
@@ -330,18 +530,18 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
break;
case Instruction::BitCast: {
// Look through bitcasts.
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
}
case Instruction::IntToPtr: {
// Look past no-op inttoptrs.
if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
case Instruction::PtrToInt: {
// Look past no-op ptrtoints.
if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
case Instruction::GetElementPtr: {
@@ -383,7 +583,7 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
// Try to grab the base operand now.
Addr.setOffset(TmpOffset);
- if (ComputeAddress(U->getOperand(0), Addr))
+ if (computeAddress(U->getOperand(0), Addr, Ty))
return true;
// We failed, restore everything and try the other options.
@@ -403,14 +603,301 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
}
break;
}
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+ break;
+ }
+ case Instruction::Shl: {
+ if (Addr.getOffsetReg())
+ break;
+
+ const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!CI)
+ break;
+
+ unsigned Val = CI->getZExtValue();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = U->getOperand(0);
+ if (const auto *I = dyn_cast<Instruction>(Src))
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ Src = I;
+
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(Src))
+ if (AI->getOpcode() == Instruction::And) {
+ const Value *LHS = AI->getOperand(0);
+ const Value *RHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
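+  // For the Shl case above: with Ty = i64, "shl %idx, 3" folds into the
+  // register-offset form [Xbase, Xidx, LSL #3] instead of a separate shift.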
+ case Instruction::Mul: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!isMulPowOf2(U))
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ // Canonicalize power-of-2 value to the RHS.
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(LHS, RHS);
+
+    assert(isa<ConstantInt>(RHS) && "Expected a ConstantInt.");
+ const auto *C = cast<ConstantInt>(RHS);
+ unsigned Val = C->getValue().logBase2();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = LHS;
+ if (const auto *I = dyn_cast<Instruction>(Src))
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ Src = I;
+
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ case Instruction::And: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!Ty || DL.getTypeSizeInBits(Ty) != 8)
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setShift(0);
+ Addr.setExtendType(AArch64_AM::LSL);
+ Addr.setExtendType(AArch64_AM::UXTW);
+
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt: {
+ if (!Addr.getReg() || Addr.getOffsetReg())
+ break;
+
+ const Value *Src = nullptr;
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(U)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(U)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ if (!Src)
+ break;
+
+ Addr.setShift(0);
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ } // end switch
+
+ if (Addr.isRegBase() && !Addr.getReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setReg(Reg);
+ return true;
+ }
+
+ if (!Addr.getOffsetReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+
+ return false;
+}
+
+bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ bool InMBB = true;
+
+ if (const auto *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+ } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+    // Look past the bitcast if its operand is in the same BB.
+ if (InMBB)
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::IntToPtr:
+    // Look past a no-op inttoptr if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::PtrToInt:
+    // Look past a no-op ptrtoint if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ }
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ Addr.setGlobalValue(GV);
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!Addr.getGlobalValue()) {
+ Addr.setReg(getRegForValue(V));
+ return Addr.getReg() != 0;
}
- // Try to get this in a register if nothing else has worked.
- if (!Addr.isValid())
- Addr.setReg(getRegForValue(Obj));
- return Addr.isValid();
+ return false;
}
+
bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT evt = TLI.getValueType(Ty, true);
@@ -428,62 +915,122 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
return TLI.isTypeLegal(VT);
}
-bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+/// \brief Determine if the value type is supported by FastISel.
+///
+/// FastISel for AArch64 can handle more value types than are legal. This adds
+/// simple value types such as i1, i8, and i16.
+bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
+ if (Ty->isVectorTy() && !IsVectorAllowed)
+ return false;
+
if (isTypeLegal(Ty, VT))
return true;
  // If this is a type that can be sign- or zero-extended to a basic operation,
- // go ahead and accept it now. For stores, this reflects truncation.
+ // go ahead and accept it now.
if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
return true;
return false;
}
-bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
- int64_t ScaleFactor, bool UseUnscaled) {
- bool needsLowering = false;
- int64_t Offset = Addr.getOffset();
- switch (VT.SimpleTy) {
- default:
+bool AArch64FastISel::isValueAvailable(const Value *V) const {
+ if (!isa<Instruction>(V))
+ return true;
+
+ const auto *I = cast<Instruction>(V);
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ return true;
+
+ return false;
+}
+
+bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- case MVT::i64:
- case MVT::f32:
- case MVT::f64:
- if (!UseUnscaled)
- // Using scaled, 12-bit, unsigned immediate offsets.
- needsLowering = ((Offset & 0xfff) != Offset);
- else
- // Using unscaled, 9-bit, signed immediate offsets.
- needsLowering = (Offset > 256 || Offset < -256);
- break;
- }
- //If this is a stack pointer and the offset needs to be simplified then put
+ bool ImmediateOffsetNeedsLowering = false;
+ bool RegisterOffsetNeedsLowering = false;
+ int64_t Offset = Addr.getOffset();
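+ // Loads and stores encode either a scaled, unsigned 12-bit immediate or an
+ // unscaled, signed 9-bit immediate. E.g. for an i32 access (ScaleFactor 4),
+ // offset 8 encodes as the scaled immediate #2, offset -4 only fits the
+ // unscaled form, and offset 16388 fits neither and must be lowered.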
+ if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset))
+ ImmediateOffsetNeedsLowering = true;
+ else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) &&
+ !isUInt<12>(Offset / ScaleFactor))
+ ImmediateOffsetNeedsLowering = true;
+
+ // Cannot encode an offset register and an immediate offset in the same
+ // instruction. Fold the immediate offset into the load/store instruction and
+ // emit an additional add to take care of the offset register.
+ if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
+ RegisterOffsetNeedsLowering = true;
+
+ // Cannot encode zero register as base.
+ if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg())
+ RegisterOffsetNeedsLowering = true;
+
+ // If this is a stack pointer and the offset needs to be simplified then put
// the alloca address into a register, set the base type back to register and
// continue. This should almost never happen.
- if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
- .addFrameIndex(Addr.getFI())
- .addImm(0)
- .addImm(0);
+ .addFrameIndex(Addr.getFI())
+ .addImm(0)
+ .addImm(0);
Addr.setKind(Address::RegBase);
Addr.setReg(ResultReg);
}
+ if (RegisterOffsetNeedsLowering) {
+ unsigned ResultReg = 0;
+ if (Addr.getReg()) {
+ if (Addr.getExtendType() == AArch64_AM::SXTW ||
+ Addr.getExtendType() == AArch64_AM::UXTW)
+ ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+ /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+ /*TODO:IsKill=*/false, Addr.getExtendType(),
+ Addr.getShift());
+ else
+ ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+ /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+ /*TODO:IsKill=*/false, AArch64_AM::LSL,
+ Addr.getShift());
+ } else {
+ if (Addr.getExtendType() == AArch64_AM::UXTW)
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift(),
+ /*IsZExt=*/true);
+ else if (Addr.getExtendType() == AArch64_AM::SXTW)
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift(),
+ /*IsZExt=*/false);
+ else
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift());
+ }
+ if (!ResultReg)
+ return false;
+
+ Addr.setReg(ResultReg);
+ Addr.setOffsetReg(0);
+ Addr.setShift(0);
+ Addr.setExtendType(AArch64_AM::InvalidShiftExtend);
+ }
+
// Since the offset is too large for the load/store instruction, get the
// reg+offset into a register.
- if (needsLowering) {
- uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
- unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
- UnscaledOffset, MVT::i64);
- if (ResultReg == 0)
+ if (ImmediateOffsetNeedsLowering) {
+ unsigned ResultReg;
+ if (Addr.getReg())
+ // Try to fold the immediate into the add instruction.
+ ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset);
+ else
+ ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
+
+ if (!ResultReg)
return false;
Addr.setReg(ResultReg);
Addr.setOffset(0);
@@ -491,222 +1038,1021 @@ bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
return true;
}
-void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+void AArch64FastISel::addLoadStoreOperands(Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool UseUnscaled) {
- int64_t Offset = Addr.getOffset();
+ unsigned Flags,
+ unsigned ScaleFactor,
+ MachineMemOperand *MMO) {
+ int64_t Offset = Addr.getOffset() / ScaleFactor;
// Frame base works a bit differently. Handle it separately.
- if (Addr.getKind() == Address::FrameIndexBase) {
+ if (Addr.isFIBase()) {
int FI = Addr.getFI();
// FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
// and alignment should be based on the VT.
- MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// Now add the rest of the operands.
- MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+ MIB.addFrameIndex(FI).addImm(Offset);
} else {
- // Now add the rest of the operands.
- MIB.addReg(Addr.getReg());
- MIB.addImm(Offset);
+ assert(Addr.isRegBase() && "Unexpected address kind.");
+ const MCInstrDesc &II = MIB->getDesc();
+ unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 1 : 0;
+ Addr.setReg(
+ constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx));
+ Addr.setOffsetReg(
+ constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1));
+ if (Addr.getOffsetReg()) {
+ assert(Addr.getOffset() == 0 && "Unexpected offset");
+ bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTX;
+ MIB.addReg(Addr.getReg());
+ MIB.addReg(Addr.getOffsetReg());
+ MIB.addImm(IsSigned);
+ MIB.addImm(Addr.getShift() != 0);
+ } else
+ MIB.addReg(Addr.getReg()).addImm(Offset);
}
+
+ if (MMO)
+ MIB.addMemOperand(MMO);
}
-bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
- bool UseUnscaled) {
- // Negative offsets require unscaled, 9-bit, signed immediate offsets.
- // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
- if (!UseUnscaled && Addr.getOffset() < 0)
- UseUnscaled = true;
+unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+ const Value *RHS, bool SetFlags,
+ bool WantResult, bool IsZExt) {
+ AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend;
+ bool NeedExtend = false;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ NeedExtend = true;
+ break;
+ case MVT::i8:
+ NeedExtend = true;
+ ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB;
+ break;
+ case MVT::i16:
+ NeedExtend = true;
+ ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH;
+ break;
+ case MVT::i32: // fall-through
+ case MVT::i64:
+ break;
+ }
+ MVT SrcVT = RetVT;
+ RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32);
+
+ // Canonicalize immediates to the RHS first.
+ if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize mul by power of 2 to the RHS.
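+ // E.g. "add (mul x, 4), y" becomes "add y, (mul x, 4)" so the multiply can
+ // be folded into a shifted-register add below.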
+ if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+ if (isMulPowOf2(LHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize shift immediate to the RHS.
+ if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+ if (const auto *SI = dyn_cast<BinaryOperator>(LHS))
+ if (isa<ConstantInt>(SI->getOperand(1)))
+ if (SI->getOpcode() == Instruction::Shl ||
+ SI->getOpcode() == Instruction::LShr ||
+ SI->getOpcode() == Instruction::AShr)
+ std::swap(LHS, RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return 0;
+ bool LHSIsKill = hasTrivialKill(LHS);
- unsigned Opc;
+ if (NeedExtend)
+ LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
+
+ unsigned ResultReg = 0;
+ if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+ uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue();
+ if (C->isNegative())
+ ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm,
+ SetFlags, WantResult);
+ else
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags,
+ WantResult);
+ } else if (const auto *C = dyn_cast<Constant>(RHS))
+ if (C->isNullValue())
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
+ WantResult);
+
+ if (ResultReg)
+ return ResultReg;
+
+ // Only extend the RHS within the instruction if there is a valid extend type.
+ if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
+ isValueAvailable(RHS)) {
+ if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
+ if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ExtendType, C->getZExtValue(),
+ SetFlags, WantResult);
+ }
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ ExtendType, 0, SetFlags, WantResult);
+ }
+
+ // Check if the mul can be folded into the instruction.
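+ // A multiply by a power of two is a left shift, which the shifted-register
+ // form can absorb, e.g. "add w0, w1, w2, lsl #2" computes w1 + w2 * 4.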
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (isMulPowOf2(RHS)) {
+ const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+ const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(MulLHS, MulRHS);
+
+ assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+ uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+ unsigned RHSReg = getRegForValue(MulLHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(MulLHS);
+ return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ AArch64_AM::LSL, ShiftVal, SetFlags, WantResult);
+ }
+
+ // Check if the shift can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
+ switch (SI->getOpcode()) {
+ default: break;
+ case Instruction::Shl: ShiftType = AArch64_AM::LSL; break;
+ case Instruction::LShr: ShiftType = AArch64_AM::LSR; break;
+ case Instruction::AShr: ShiftType = AArch64_AM::ASR; break;
+ }
+ uint64_t ShiftVal = C->getZExtValue();
+ if (ShiftType != AArch64_AM::InvalidShiftExtend) {
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftVal, SetFlags,
+ WantResult);
+ }
+ }
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (NeedExtend)
+ RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
+
+ return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrr, AArch64::SUBXrr },
+ { AArch64::ADDWrr, AArch64::ADDXrr } },
+ { { AArch64::SUBSWrr, AArch64::SUBSXrr },
+ { AArch64::ADDSWrr, AArch64::ADDSXrr } }
+ };
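+ // Indexed as OpcTable[SetFlags][UseAdd][Is64Bit]; e.g. OpcTable[1][1][1]
+ // selects ADDSXrr. The _ri/_rs/_rx variants below use the same layout.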
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm,
+ bool SetFlags, bool WantResult) {
+ assert(LHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
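+ // The arithmetic immediate is a 12-bit value, optionally shifted left by 12,
+ // so e.g. 0x123 encodes directly and 0x123000 encodes as (0x123, LSL #12).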
+ unsigned ShiftImm;
+ if (isUInt<12>(Imm))
+ ShiftImm = 0;
+ else if ((Imm & 0xfff000) == Imm) {
+ ShiftImm = 12;
+ Imm >>= 12;
+ } else
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWri, AArch64::SUBXri },
+ { AArch64::ADDWri, AArch64::ADDXri } },
+ { { AArch64::SUBSWri, AArch64::SUBSXri },
+ { AArch64::ADDSWri, AArch64::ADDSXri } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
const TargetRegisterClass *RC;
- bool VTIsi1 = false;
- int64_t ScaleFactor = 0;
+ if (SetFlags)
+ RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ else
+ RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addImm(Imm)
+ .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrs, AArch64::SUBXrs },
+ { AArch64::ADDWrs, AArch64::ADDXrs } },
+ { { AArch64::SUBSWrs, AArch64::SUBSXrs },
+ { AArch64::ADDSWrs, AArch64::ADDSXrs } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addImm(getShifterImm(ShiftType, ShiftImm));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ExtType,
+ uint64_t ShiftImm, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrx, AArch64::SUBXrx },
+ { AArch64::ADDWrx, AArch64::ADDXrx } },
+ { { AArch64::SUBSWrx, AArch64::SUBSXrx },
+ { AArch64::ADDSWrx, AArch64::ADDSXrx } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC = nullptr;
+ if (SetFlags)
+ RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ else
+ RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addImm(getArithExtendImm(ExtType, ShiftImm));
+ return ResultReg;
+}
+
+bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
+ Type *Ty = LHS->getType();
+ EVT CmpEVT = TLI.getValueType(Ty, true);
+ if (!CmpEVT.isSimple())
+ return false;
+ MVT VT = CmpEVT.getSimpleVT();
+
switch (VT.SimpleTy) {
default:
return false;
case MVT::i1:
- VTIsi1 = true;
- // Intentional fall-through.
case MVT::i8:
- Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ return emitICmp(VT, LHS, RHS, IsZExt);
+ case MVT::f32:
+ case MVT::f64:
+ return emitFCmp(VT, LHS, RHS);
+ }
+}
+
+bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool IsZExt) {
+ return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false,
+ IsZExt) != 0;
+}
+
+bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm,
+ /*SetFlags=*/true, /*WantResult=*/false) != 0;
+}
+
+bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
+ if (RetVT != MVT::f32 && RetVT != MVT::f64)
+ return false;
+
+ // Check to see if the 2nd operand is a constant that we can encode directly
+ // in the compare.
+ bool UseImm = false;
+ if (const auto *CFP = dyn_cast<ConstantFP>(RHS))
+ if (CFP->isZero() && !CFP->isNegative())
+ UseImm = true;
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (UseImm) {
+ unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ return true;
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill));
+ return true;
+}
+
+unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags, bool WantResult, bool IsZExt) {
+ return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult,
+ IsZExt);
+}
+
+/// \brief This method is a wrapper to simplify add emission.
+///
+/// First try to emit an add with an immediate operand using emitAddSub_ri. If
+/// that fails, then try to materialize the immediate into a register and use
+/// emitAddSub_rr instead.
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
+ int64_t Imm) {
+ unsigned ResultReg;
+ if (Imm < 0)
+ ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm);
+ else
+ ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm);
+
+ if (ResultReg)
+ return ResultReg;
+
+ unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm);
+ if (!CReg)
+ return 0;
+
+ ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags, bool WantResult, bool IsZExt) {
+ return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult,
+ IsZExt);
+}
+
+unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill, bool WantResult) {
+ return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, /*SetFlags=*/true, WantResult);
+}
+
+unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool WantResult) {
+ return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true,
+ WantResult);
+}
+
+unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+ const Value *LHS, const Value *RHS) {
+ // Canonicalize immediates to the RHS first.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize mul by power-of-2 to the RHS.
+ if (LHS->hasOneUse() && isValueAvailable(LHS))
+ if (isMulPowOf2(LHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize shift immediate to the RHS.
+ if (LHS->hasOneUse() && isValueAvailable(LHS))
+ if (const auto *SI = dyn_cast<ShlOperator>(LHS))
+ if (isa<ConstantInt>(SI->getOperand(1)))
+ std::swap(LHS, RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return 0;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned ResultReg = 0;
+ if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+ uint64_t Imm = C->getZExtValue();
+ ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+ }
+ if (ResultReg)
+ return ResultReg;
+
+ // Check if the mul can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (isMulPowOf2(RHS)) {
+ const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+ const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(MulLHS, MulRHS);
+
+ assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+ uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+
+ unsigned RHSReg = getRegForValue(MulLHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(MulLHS);
+ return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ }
+
+ // Check if the shift can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (const auto *SI = dyn_cast<ShlOperator>(RHS))
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ uint64_t ShiftVal = C->getZExtValue();
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ MVT VT = std::max(MVT::i32, RetVT.SimpleTy);
+ ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
+ unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+ "ISD nodes are not consecutive!");
+ static const unsigned OpcTable[3][2] = {
+ { AArch64::ANDWri, AArch64::ANDXri },
+ { AArch64::ORRWri, AArch64::ORRXri },
+ { AArch64::EORWri, AArch64::EORXri }
+ };
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ unsigned RegSize;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32: {
+ unsigned Idx = ISDOpc - ISD::AND;
+ Opc = OpcTable[Idx][0];
+ RC = &AArch64::GPR32spRegClass;
+ RegSize = 32;
+ break;
+ }
+ case MVT::i64:
+ Opc = OpcTable[ISDOpc - ISD::AND][1];
+ RC = &AArch64::GPR64spRegClass;
+ RegSize = 64;
+ break;
+ }
+
+ if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return 0;
+
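+ // Emit the operation on the full 32/64-bit register. For i8/i16 the upper
+ // bits must be cleared again afterwards to keep the value zero-extended,
+ // unless the operation was an AND, whose immediate already masked them.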
+ unsigned ResultReg =
+ fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill,
+ AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
+ unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill,
+ uint64_t ShiftImm) {
+ assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+ "ISD nodes are not consecutive!");
+ static const unsigned OpcTable[3][2] = {
+ { AArch64::ANDWrs, AArch64::ANDXrs },
+ { AArch64::ORRWrs, AArch64::ORRXrs },
+ { AArch64::EORWrs, AArch64::EORXrs }
+ };
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = OpcTable[ISDOpc - ISD::AND][0];
RC = &AArch64::GPR32RegClass;
+ break;
+ case MVT::i64:
+ Opc = OpcTable[ISDOpc - ISD::AND][1];
+ RC = &AArch64::GPR64RegClass;
+ break;
+ }
+ unsigned ResultReg =
+ fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm);
+}
+
+unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
+ bool WantZExt, MachineMemOperand *MMO) {
+ // Simplify this down to something we can handle.
+ if (!simplifyAddress(Addr, VT))
+ return 0;
+
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
+ llvm_unreachable("Unexpected value type.");
+
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ bool UseScaled = true;
+ if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+ UseScaled = false;
ScaleFactor = 1;
+ }
+
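+ // Each addressing mode (unscaled, scaled, register offset (X), register
+ // offset (extended W)) has one row for a 32-bit and one for a 64-bit
+ // destination; see the GPOpcTable[WantZExt][2 * Idx + IsRet64Bit] uses below.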
+ static const unsigned GPOpcTable[2][8][4] = {
+ // Sign-extend.
+ { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi,
+ AArch64::LDURXi },
+ { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui,
+ AArch64::LDRXui },
+ { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW,
+ AArch64::LDRXroW },
+ { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW,
+ AArch64::LDRXroW }
+ },
+ // Zero-extend.
+ { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW }
+ }
+ };
+
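+ // FP loads only vary by addressing mode: unscaled, scaled, register offset
+ // (X), and register offset (extended W).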
+ static const unsigned FPOpcTable[4][2] = {
+ { AArch64::LDURSi, AArch64::LDURDi },
+ { AArch64::LDRSui, AArch64::LDRDui },
+ { AArch64::LDRSroX, AArch64::LDRDroX },
+ { AArch64::LDRSroW, AArch64::LDRDroW }
+ };
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+ Addr.getOffsetReg();
+ unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+ if (Addr.getExtendType() == AArch64_AM::UXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTW)
+ Idx++;
+
+ bool IsRet64Bit = RetVT == MVT::i64;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected value type.");
+ case MVT::i1: // Intentional fall-through.
+ case MVT::i8:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i16:
- Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
- RC = &AArch64::GPR32RegClass;
- ScaleFactor = 2;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i32:
- Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
- RC = &AArch64::GPR32RegClass;
- ScaleFactor = 4;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i64:
- Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
RC = &AArch64::GPR64RegClass;
- ScaleFactor = 8;
break;
case MVT::f32:
- Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
- RC = TLI.getRegClassFor(VT);
- ScaleFactor = 4;
+ Opc = FPOpcTable[Idx][0];
+ RC = &AArch64::FPR32RegClass;
break;
case MVT::f64:
- Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
- RC = TLI.getRegClassFor(VT);
- ScaleFactor = 8;
+ Opc = FPOpcTable[Idx][1];
+ RC = &AArch64::FPR64RegClass;
break;
}
- // Scale the offset.
- if (!UseUnscaled) {
- int64_t Offset = Addr.getOffset();
- if (Offset & (ScaleFactor - 1))
- // Retry using an unscaled, 9-bit, signed immediate offset.
- return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
-
- Addr.setOffset(Offset / ScaleFactor);
- }
-
- // Simplify this down to something we can handle.
- if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
- return false;
// Create the base instruction, then add the operands.
- ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
- AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+ addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
// Loading an i1 requires special handling.
- if (VTIsi1) {
- MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(ResultReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (VT == MVT::i1) {
+ unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
ResultReg = ANDReg;
}
+
+ // For zero-extending loads to 64 bits we emit a 32-bit load and then convert
+ // the 32-bit reg to a 64-bit reg.
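+ // On AArch64 a 32-bit load already zeroes the upper 32 bits of the X
+ // register, so the SUBREG_TO_REG pseudo just retypes the value and normally
+ // folds away rather than emitting an extra instruction.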
+ if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(ResultReg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ ResultReg = Reg64;
+ }
+ return ResultReg;
+}
+
+bool AArch64FastISel::selectAddSub(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (VT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ unsigned ResultReg;
+ switch (I->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::Add:
+ ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Sub:
+ ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1));
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectLogicalOp(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (VT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ unsigned ResultReg;
+ switch (I->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::And:
+ ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Or:
+ ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Xor:
+ ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectLoad(const Instruction *I) {
+bool AArch64FastISel::selectLoad(const Instruction *I) {
MVT VT;
// Verify we have a legal type before going any further. Currently, we handle
// simple types that will directly fit in a register (i32/f32/i64/f64) or
// those that can be sign or zero-extended to a basic operation (i1/i8/i16).
- if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) ||
+ cast<LoadInst>(I)->isAtomic())
return false;
// See if we can handle this address.
Address Addr;
- if (!ComputeAddress(I->getOperand(0), Addr))
+ if (!computeAddress(I->getOperand(0), Addr, I->getType()))
return false;
- unsigned ResultReg;
- if (!EmitLoad(VT, ResultReg, Addr))
+ // Fold the following sign-/zero-extend into the load instruction.
+ bool WantZExt = true;
+ MVT RetVT = VT;
+ const Value *IntExtVal = nullptr;
+ if (I->hasOneUse()) {
+ if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(ZE->getType(), RetVT))
+ IntExtVal = ZE;
+ else
+ RetVT = VT;
+ } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(SE->getType(), RetVT))
+ IntExtVal = SE;
+ else
+ RetVT = VT;
+ WantZExt = false;
+ }
+ }
+
+ unsigned ResultReg =
+ emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I));
+ if (!ResultReg)
return false;
- UpdateValueMap(I, ResultReg);
+ // There are a few different cases we have to handle, because the load or the
+ // sign-/zero-extend might not be selected by FastISel if we fall back to
+ // SelectionDAG. There is also an ordering issue when both instructions are in
+ // different basic blocks.
+ // 1.) The load instruction is selected by FastISel, but not the integer
+ // extend. This usually happens when the integer extend is in a different
+ // basic block and SelectionDAG took over for that basic block.
+ // 2.) The load instruction is selected before the integer extend. This only
+ // happens when the integer extend is in a different basic block.
+ // 3.) The load instruction is selected by SelectionDAG and the integer extend
+ // by FastISel. This happens if there are instructions between the load
+ // and the integer extend that couldn't be selected by FastISel.
+ if (IntExtVal) {
+ // The integer extend hasn't been emitted yet. FastISel or SelectionDAG
+ // could select it. Emit a copy to subreg if necessary. FastISel will remove
+ // it when it selects the integer extend.
+ unsigned Reg = lookUpRegForValue(IntExtVal);
+ if (!Reg) {
+ if (RetVT == MVT::i64 && VT <= MVT::i32) {
+ if (WantZExt) {
+ // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
+ std::prev(FuncInfo.InsertPt)->eraseFromParent();
+ ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg();
+ } else
+ ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
+ /*IsKill=*/true,
+ AArch64::sub_32);
+ }
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // The integer extend has already been emitted - delete all the instructions
+ // that have been emitted by the integer extend lowering code and use the
+ // result from the load instruction directly.
+ while (Reg) {
+ auto *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI)
+ break;
+ Reg = 0;
+ for (auto &Opnd : MI->uses()) {
+ if (Opnd.isReg()) {
+ Reg = Opnd.getReg();
+ break;
+ }
+ }
+ MI->eraseFromParent();
+ }
+ updateValueMap(IntExtVal, ResultReg);
+ return true;
+ }
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
- bool UseUnscaled) {
+bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
+ MachineMemOperand *MMO) {
+ // Simplify this down to something we can handle.
+ if (!simplifyAddress(Addr, VT))
+ return false;
+
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
+ llvm_unreachable("Unexpected value type.");
+
// Negative offsets require unscaled, 9-bit, signed immediate offsets.
// Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
- if (!UseUnscaled && Addr.getOffset() < 0)
- UseUnscaled = true;
-
- unsigned StrOpc;
- bool VTIsi1 = false;
- int64_t ScaleFactor = 0;
- // Using scaled, 12-bit, unsigned immediate offsets.
- switch (VT.SimpleTy) {
- default:
- return false;
- case MVT::i1:
- VTIsi1 = true;
- case MVT::i8:
- StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
+ bool UseScaled = true;
+ if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+ UseScaled = false;
ScaleFactor = 1;
- break;
- case MVT::i16:
- StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
- ScaleFactor = 2;
- break;
- case MVT::i32:
- StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
- ScaleFactor = 4;
- break;
- case MVT::i64:
- StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
- ScaleFactor = 8;
- break;
- case MVT::f32:
- StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
- ScaleFactor = 4;
- break;
- case MVT::f64:
- StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
- ScaleFactor = 8;
- break;
}
- // Scale the offset.
- if (!UseUnscaled) {
- int64_t Offset = Addr.getOffset();
- if (Offset & (ScaleFactor - 1))
- // Retry using an unscaled, 9-bit, signed immediate offset.
- return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
- Addr.setOffset(Offset / ScaleFactor);
- }
+ static const unsigned OpcTable[4][6] = {
+ { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi,
+ AArch64::STURSi, AArch64::STURDi },
+ { AArch64::STRBBui, AArch64::STRHHui, AArch64::STRWui, AArch64::STRXui,
+ AArch64::STRSui, AArch64::STRDui },
+ { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX,
+ AArch64::STRSroX, AArch64::STRDroX },
+ { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW,
+ AArch64::STRSroW, AArch64::STRDroW }
+ };
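+ // Rows are indexed by addressing mode (unscaled, scaled, register offset
+ // (X), register offset (extended W)); columns by type (i8, i16, i32, i64,
+ // f32, f64).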
- // Simplify this down to something we can handle.
- if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
- return false;
+ unsigned Opc;
+ bool VTIsi1 = false;
+ bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+ Addr.getOffsetReg();
+ unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+ if (Addr.getExtendType() == AArch64_AM::UXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTW)
+ Idx++;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i1: VTIsi1 = true; // Intentional fall-through.
+ case MVT::i8: Opc = OpcTable[Idx][0]; break;
+ case MVT::i16: Opc = OpcTable[Idx][1]; break;
+ case MVT::i32: Opc = OpcTable[Idx][2]; break;
+ case MVT::i64: Opc = OpcTable[Idx][3]; break;
+ case MVT::f32: Opc = OpcTable[Idx][4]; break;
+ case MVT::f64: Opc = OpcTable[Idx][5]; break;
+ }
// Storing an i1 requires special handling.
- if (VTIsi1) {
- MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(SrcReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (VTIsi1 && SrcReg != AArch64::WZR) {
+ unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
SrcReg = ANDReg;
}
// Create the base instruction, then add the operands.
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(StrOpc)).addReg(SrcReg);
- AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+ const MCInstrDesc &II = TII.get(Opc);
+ SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg);
+ addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO);
+
return true;
}
-bool AArch64FastISel::SelectStore(const Instruction *I) {
+bool AArch64FastISel::selectStore(const Instruction *I) {
MVT VT;
- Value *Op0 = I->getOperand(0);
+ const Value *Op0 = I->getOperand(0);
// Verify we have a legal type before going any further. Currently, we handle
// simple types that will directly fit in a register (i32/f32/i64/f64) or
// those that can be sign or zero-extended to a basic operation (i1/i8/i16).
- if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+ if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true) ||
cast<StoreInst>(I)->isAtomic())
return false;
- // Get the value to be stored into a register.
- unsigned SrcReg = getRegForValue(Op0);
- if (SrcReg == 0)
+ // Get the value to be stored into a register. Use the zero register directly
+ // when possible to avoid an unnecessary copy and a wasted register.
+ unsigned SrcReg = 0;
+ if (const auto *CI = dyn_cast<ConstantInt>(Op0)) {
+ if (CI->isZero())
+ SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) {
+ if (CF->isZero() && !CF->isNegative()) {
+ VT = MVT::getIntegerVT(VT.getSizeInBits());
+ SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ }
+ }
+
+ if (!SrcReg)
+ SrcReg = getRegForValue(Op0);
+
+ if (!SrcReg)
return false;
// See if we can handle this address.
Address Addr;
- if (!ComputeAddress(I->getOperand(1), Addr))
+ if (!computeAddress(I->getOperand(1), Addr, I->getOperand(0)->getType()))
return false;
- if (!EmitStore(VT, SrcReg, Addr))
+ if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I)))
return false;
return true;
}
@@ -757,58 +2103,235 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
}
}
-bool AArch64FastISel::SelectBranch(const Instruction *I) {
+/// \brief Try to emit a combined compare-and-branch instruction.
+bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
+ assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
+ const CmpInst *CI = cast<CmpInst>(BI->getCondition());
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ MVT VT;
+ if (!isTypeSupported(LHS->getType(), VT))
+ return false;
+
+ unsigned BW = VT.getSizeInBits();
+ if (BW > 64)
+ return false;
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ int TestBit = -1;
+ bool IsCmpNE;
+ switch (Predicate) {
+ default:
+ return false;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::ICMP_NE:
+ if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue())
+ std::swap(LHS, RHS);
+
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
+ if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) {
+ const Value *AndLHS = AI->getOperand(0);
+ const Value *AndRHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(AndLHS, AndRHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndRHS))
+ if (C->getValue().isPowerOf2()) {
+ TestBit = C->getValue().logBase2();
+ LHS = AndLHS;
+ }
+ }
+
+ if (VT == MVT::i1)
+ TestBit = 0;
+
+ IsCmpNE = Predicate == CmpInst::ICMP_NE;
+ break;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ TestBit = BW - 1;
+ IsCmpNE = Predicate == CmpInst::ICMP_SLT;
+ break;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SLE:
+ if (!isa<ConstantInt>(RHS))
+ return false;
+
+ if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true))
+ return false;
+
+ TestBit = BW - 1;
+ IsCmpNE = Predicate == CmpInst::ICMP_SLE;
+ break;
+ } // end switch
+
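+ // Select between compare-and-branch (CBZ/CBNZ) and test-bit-and-branch
+ // (TBZ/TBNZ); indexed as OpcTable[IsBitTest][IsCmpNE][Is64Bit].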
+ static const unsigned OpcTable[2][2][2] = {
+ { {AArch64::CBZW, AArch64::CBZX },
+ {AArch64::CBNZW, AArch64::CBNZX} },
+ { {AArch64::TBZW, AArch64::TBZX },
+ {AArch64::TBNZW, AArch64::TBNZX} }
+ };
+
+ bool IsBitTest = TestBit != -1;
+ bool Is64Bit = BW == 64;
+ if (TestBit < 32 && TestBit >= 0)
+ Is64Bit = false;
+
+ unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
+ const MCInstrDesc &II = TII.get(Opc);
+
+ unsigned SrcReg = getRegForValue(LHS);
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(LHS);
+
+ if (BW == 64 && !Is64Bit)
+ SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+ AArch64::sub_32);
+
+ if ((BW < 32) && !IsBitTest)
+ SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true);
+
+ // Emit the combined compare and branch instruction.
+ SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+ if (IsBitTest)
+ MIB.addImm(TestBit);
+ MIB.addMBB(TBB);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+ fastEmitBranch(FBB, DbgLoc);
+
+ return true;
+}
+
+bool AArch64FastISel::selectBranch(const Instruction *I) {
const BranchInst *BI = cast<BranchInst>(I);
+ if (BI->isUnconditional()) {
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ fastEmitBranch(MSucc, BI->getDebugLoc());
+ return true;
+ }
+
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+ AArch64CC::CondCode CC = AArch64CC::NE;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
- // We may not handle every CC for now.
- AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
- if (CC == AArch64CC::AL)
- return false;
+ if (CI->hasOneUse() && isValueAvailable(CI)) {
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ fastEmitBranch(FBB, DbgLoc);
+ return true;
+ case CmpInst::FCMP_TRUE:
+ fastEmitBranch(TBB, DbgLoc);
+ return true;
+ }
+
+ // Try to emit a combined compare-and-branch first.
+ if (emitCompareAndBranch(BI))
+ return true;
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
// Emit the cmp.
- if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
+ // instruction.
+ CC = getCompareCC(Predicate);
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+
+ // Emit the extra branch for FCMP_UEQ and FCMP_ONE.
+ if (ExtraCC != AArch64CC::AL) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(ExtraCC)
+ .addMBB(TBB);
+ }
+
// Emit the branch.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
MVT SrcVT;
- if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
- (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+ if (TI->hasOneUse() && isValueAvailable(TI) &&
+ isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) {
unsigned CondReg = getRegForValue(TI->getOperand(0));
- if (CondReg == 0)
+ if (!CondReg)
return false;
+ bool CondIsKill = hasTrivialKill(TI->getOperand(0));
// Issue an extract_subreg to get the lower 32-bits.
- if (SrcVT == MVT::i64)
- CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+ if (SrcVT == MVT::i64) {
+ CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill,
AArch64::sub_32);
+ CondIsKill = true;
+ }
- MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(AArch64::ANDWri), ANDReg)
- .addReg(CondReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(AArch64::SUBSWri))
- .addReg(ANDReg)
- .addReg(ANDReg)
- .addImm(0)
- .addImm(0);
+ unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
+ emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0);
- unsigned CC = AArch64CC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
CC = AArch64CC::EQ;
@@ -816,23 +2339,57 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
- } else if (const ConstantInt *CI =
- dyn_cast<ConstantInt>(BI->getCondition())) {
+ } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
uint64_t Imm = CI->getZExtValue();
MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
.addMBB(Target);
- FuncInfo.MBB->addSuccessor(Target);
+
+ // Obtain the branch weight and add the target to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ Target->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(Target, BranchWeight);
+ return true;
+ } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (!CondReg)
+ return false;
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
unsigned CondReg = getRegForValue(BI->getCondition());
if (CondReg == 0)
return false;
+ bool CondRegIsKill = hasTrivialKill(BI->getCondition());
// We've been divorced from our compare! Our block was split, and
// now our compare lives in a predecessor block. We mustn't
@@ -841,13 +2398,8 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
// Regardless, the compare has been done in the predecessor block,
// and it left a value for us in a virtual register. Ergo, we test
// the one-bit value left in the virtual register.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
- AArch64::WZR)
- .addReg(CondReg)
- .addImm(0)
- .addImm(0);
+ emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0);
- unsigned CC = AArch64CC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
CC = AArch64CC::EQ;
@@ -856,20 +2408,28 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
-bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
const IndirectBrInst *BI = cast<IndirectBrInst>(I);
unsigned AddrReg = getRegForValue(BI->getOperand(0));
if (AddrReg == 0)
return false;
// Emit the indirect branch.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
- .addReg(AddrReg);
+ const MCInstrDesc &II = TII.get(AArch64::BR);
+ AddrReg = constrainOperandRegClass(II, AddrReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
// Make sure the CFG is up-to-date.
for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
@@ -878,211 +2438,271 @@ bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
return true;
}
-bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
- Type *Ty = Src1Value->getType();
- EVT SrcEVT = TLI.getValueType(Ty, true);
- if (!SrcEVT.isSimple())
- return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
-
- // Check to see if the 2nd operand is a constant that we can encode directly
- // in the compare.
- uint64_t Imm;
- bool UseImm = false;
- bool isNegativeImm = false;
- if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
- if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
- SrcVT == MVT::i8 || SrcVT == MVT::i1) {
- const APInt &CIVal = ConstInt->getValue();
-
- Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
- if (CIVal.isNegative()) {
- isNegativeImm = true;
- Imm = -Imm;
- }
- // FIXME: We can handle more immediates using shifts.
- UseImm = ((Imm & 0xfff) == Imm);
- }
- } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
- if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
- if (ConstFP->isZero() && !ConstFP->isNegative())
- UseImm = true;
- }
+bool AArch64FastISel::selectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
- unsigned ZReg;
- unsigned CmpOpc;
- bool isICmp = true;
- bool needsExt = false;
- switch (SrcVT.SimpleTy) {
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
default:
- return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- needsExt = true;
- // Intentional fall-through.
- case MVT::i32:
- ZReg = AArch64::WZR;
- if (UseImm)
- CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
- else
- CmpOpc = AArch64::SUBSWrr;
break;
- case MVT::i64:
- ZReg = AArch64::XZR;
- if (UseImm)
- CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
- else
- CmpOpc = AArch64::SUBSXrr;
- break;
- case MVT::f32:
- isICmp = false;
- CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
+ case CmpInst::FCMP_FALSE:
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(AArch64::WZR, getKillRegState(true));
break;
- case MVT::f64:
- isICmp = false;
- CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
+ case CmpInst::FCMP_TRUE:
+ ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1);
break;
}
- unsigned SrcReg1 = getRegForValue(Src1Value);
- if (SrcReg1 == 0)
- return false;
-
- unsigned SrcReg2;
- if (!UseImm) {
- SrcReg2 = getRegForValue(Src2Value);
- if (SrcReg2 == 0)
- return false;
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
}
- // We have i1, i8, or i16, we need to either zero extend or sign extend.
- if (needsExt) {
- SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
- if (SrcReg1 == 0)
- return false;
- if (!UseImm) {
- SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
- if (SrcReg2 == 0)
- return false;
- }
- }
+ // Emit the cmp.
+ if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
- if (isICmp) {
- if (UseImm)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(ZReg)
- .addReg(SrcReg1)
- .addImm(Imm)
- .addImm(0);
- else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(ZReg)
- .addReg(SrcReg1)
- .addReg(SrcReg2);
- } else {
- if (UseImm)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(SrcReg1);
- else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(SrcReg1)
- .addReg(SrcReg2);
- }
- return true;
-}
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
-bool AArch64FastISel::SelectCmp(const Instruction *I) {
- const CmpInst *CI = cast<CmpInst>(I);
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These
+  // condition codes are inverted because they are used by CSINC.
+ static unsigned CondCodeTable[2][2] = {
+ { AArch64CC::NE, AArch64CC::VC },
+ { AArch64CC::PL, AArch64CC::LE }
+ };
+ unsigned *CondCodes = nullptr;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ CondCodes = &CondCodeTable[0][0];
+ break;
+ case CmpInst::FCMP_ONE:
+ CondCodes = &CondCodeTable[1][0];
+ break;
+ }
- // We may not handle every CC for now.
- AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
- if (CC == AArch64CC::AL)
- return false;
+ if (CondCodes) {
+ unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+ TmpReg1)
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addImm(CondCodes[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+ ResultReg)
+ .addReg(TmpReg1, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addImm(CondCodes[1]);
- // Emit the cmp.
- if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
- return false;
+ updateValueMap(I, ResultReg);
+ return true;
+ }
// Now set a register based on the comparison.
+ AArch64CC::CondCode CC = getCompareCC(Predicate);
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
- unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
ResultReg)
- .addReg(AArch64::WZR)
- .addReg(AArch64::WZR)
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
.addImm(invertedCC);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
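// Editorial sketch (annotation, not part of this patch): for
//   %c = fcmp ueq float %a, %b
// the two-CSINC path above emits roughly
//   fcmp  s0, s1
//   csinc w8, wzr, wzr, ne   ; w8 = (a == b) ? 1 : 0
//   csinc w0, w8,  wzr, vc   ; w0 = unordered ? 1 : w8
// building "unordered or equal" from the inverted NE/VC codes in the table.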
-bool AArch64FastISel::SelectSelect(const Instruction *I) {
- const SelectInst *SI = cast<SelectInst>(I);
-
- EVT DestEVT = TLI.getValueType(SI->getType(), true);
- if (!DestEVT.isSimple())
+/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// value.
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
+ if (!SI->getType()->isIntegerTy(1))
return false;
- MVT DestVT = DestEVT.getSimpleVT();
- if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
- DestVT != MVT::f64)
- return false;
+ const Value *Src1Val, *Src2Val;
+ unsigned Opc = 0;
+ bool NeedExtraOp = false;
+ if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) {
+ if (CI->isOne()) {
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getFalseValue();
+ Opc = AArch64::ORRWrr;
+ } else {
+ assert(CI->isZero());
+ Src1Val = SI->getFalseValue();
+ Src2Val = SI->getCondition();
+ Opc = AArch64::BICWrr;
+ }
+ } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) {
+ if (CI->isOne()) {
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getTrueValue();
+ Opc = AArch64::ORRWrr;
+ NeedExtraOp = true;
+ } else {
+ assert(CI->isZero());
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getTrueValue();
+ Opc = AArch64::ANDWrr;
+ }
+ }
- unsigned CondReg = getRegForValue(SI->getCondition());
- if (CondReg == 0)
- return false;
- unsigned TrueReg = getRegForValue(SI->getTrueValue());
- if (TrueReg == 0)
+ if (!Opc)
return false;
- unsigned FalseReg = getRegForValue(SI->getFalseValue());
- if (FalseReg == 0)
+
+ unsigned Src1Reg = getRegForValue(Src1Val);
+ if (!Src1Reg)
return false;
+ bool Src1IsKill = hasTrivialKill(Src1Val);
+ unsigned Src2Reg = getRegForValue(Src2Val);
+ if (!Src2Reg)
+ return false;
+ bool Src2IsKill = hasTrivialKill(Src2Val);
- MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(CondReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (NeedExtraOp) {
+ Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1);
+ Src1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32spRegClass, Src1Reg,
+ Src1IsKill, Src2Reg, Src2IsKill);
+ updateValueMap(SI, ResultReg);
+ return true;
+}
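// Editorial sketch (annotation, not part of this patch): the four i1
// patterns above reduce a select to one or two logic ops, e.g.
//   select i1 %c, i1 true,  i1 %b  ->  orr w0, wc, wb
//   select i1 %c, i1 false, i1 %b  ->  bic w0, wb, wc
//   select i1 %c, i1 %a, i1 true   ->  eor w8, wc, #1; orr w0, w8, wa
//   select i1 %c, i1 %a, i1 false  ->  and w0, wc, wa
// (wc/wa/wb stand for the condition and operand registers.)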
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
- .addReg(ANDReg)
- .addReg(ANDReg)
- .addImm(0)
- .addImm(0);
+bool AArch64FastISel::selectSelect(const Instruction *I) {
+ assert(isa<SelectInst>(I) && "Expected a select instruction.");
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT))
+ return false;
- unsigned SelectOpc;
- switch (DestVT.SimpleTy) {
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
default:
return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
case MVT::i32:
- SelectOpc = AArch64::CSELWr;
+ Opc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
break;
case MVT::i64:
- SelectOpc = AArch64::CSELXr;
+ Opc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
break;
case MVT::f32:
- SelectOpc = AArch64::FCSELSrrr;
+ Opc = AArch64::FCSELSrrr;
+ RC = &AArch64::FPR32RegClass;
break;
case MVT::f64:
- SelectOpc = AArch64::FCSELDrrr;
+ Opc = AArch64::FCSELDrrr;
+ RC = &AArch64::FPR64RegClass;
break;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
- ResultReg)
- .addReg(TrueReg)
- .addReg(FalseReg)
- .addImm(AArch64CC::NE);
+ const SelectInst *SI = cast<SelectInst>(I);
+ const Value *Cond = SI->getCondition();
+ AArch64CC::CondCode CC = AArch64CC::NE;
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL;
- UpdateValueMap(I, ResultReg);
+ if (optimizeSelect(SI))
+ return true;
+
+  // Try to pick up the flags, so we don't have to emit another compare.
+ if (foldXALUIntrinsic(CC, I, Cond)) {
+    // Fake-request the condition to force emission of the XALU intrinsic.
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
+ isValueAvailable(Cond)) {
+ const auto *Cmp = cast<CmpInst>(Cond);
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp);
+ const Value *FoldSelect = nullptr;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ FoldSelect = SI->getFalseValue();
+ break;
+ case CmpInst::FCMP_TRUE:
+ FoldSelect = SI->getTrueValue();
+ break;
+ }
+
+ if (FoldSelect) {
+ unsigned SrcReg = getRegForValue(FoldSelect);
+ if (!SrcReg)
+ return false;
+ unsigned UseReg = lookUpRegForValue(SI);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+
+ // Emit the cmp.
+ if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+ return false;
+
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
+ CC = getCompareCC(Predicate);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ // Emit a TST instruction (ANDS wzr, reg, #imm).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDSWri),
+ AArch64::WZR)
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ }
+
+ unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+ bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
+
+ unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+ bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
+
+ if (!Src1Reg || !Src2Reg)
+ return false;
+
+ if (ExtraCC != AArch64CC::AL) {
+ Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+ Src2IsKill, ExtraCC);
+ Src2IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+ Src2IsKill, CC);
+ updateValueMap(I, ResultReg);
return true;
}
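// Editorial sketch (annotation, not part of this patch): in the generic
// case a select becomes a flag-setting test plus CSEL, e.g.
//   %r = select i1 %c, i32 %a, i32 %b
// emits roughly
//   ands wzr, wc, #1       ; TST wc, #1
//   csel w0, wa, wb, ne
// while FCMP_UEQ/FCMP_ONE chain a second CSEL through ExtraCC first.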
-bool AArch64FastISel::SelectFPExt(const Instruction *I) {
+bool AArch64FastISel::selectFPExt(const Instruction *I) {
Value *V = I->getOperand(0);
if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
return false;
@@ -1094,11 +2714,11 @@ bool AArch64FastISel::SelectFPExt(const Instruction *I) {
unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
ResultReg).addReg(Op);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
+bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
Value *V = I->getOperand(0);
if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
return false;
@@ -1110,12 +2730,12 @@ bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
ResultReg).addReg(Op);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
// FPToUI and FPToSI
-bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
@@ -1144,11 +2764,11 @@ bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
@@ -1156,22 +2776,21 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
"Unexpected value type.");
unsigned SrcReg = getRegForValue(I->getOperand(0));
- if (SrcReg == 0)
+ if (!SrcReg)
return false;
+ bool SrcIsKill = hasTrivialKill(I->getOperand(0));
EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
// Handle sign-extension.
if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
SrcReg =
- EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
- if (SrcReg == 0)
+ emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+ if (!SrcReg)
return false;
+ SrcIsKill = true;
}
- MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
- : &AArch64::GPR32RegClass);
-
unsigned Opc;
if (SrcVT == MVT::i64) {
if (Signed)
@@ -1185,21 +2804,128 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(SrcReg);
- UpdateValueMap(I, ResultReg);
+ unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg,
+ SrcIsKill);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ // Only handle simple cases of up to 8 GPR and FPR each.
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(ArgTy);
+ if (!ArgVT.isSimple())
+ return false;
+
+ MVT VT = ArgVT.getSimpleVT().SimpleTy;
+ if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8())
+ return false;
+
+ if (VT.isVector() &&
+ (!Subtarget->hasNEON() || !Subtarget->isLittleEndian()))
+ return false;
+
+ if (VT >= MVT::i1 && VT <= MVT::i64)
+ ++GPRCnt;
+ else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() ||
+ VT.is128BitVector())
+ ++FPRCnt;
+ else
+ return false;
+
+ if (GPRCnt > 8 || FPRCnt > 8)
+ return false;
+ }
+
+ static const MCPhysReg Registers[6][8] = {
+ { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+ AArch64::W5, AArch64::W6, AArch64::W7 },
+ { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+ AArch64::X5, AArch64::X6, AArch64::X7 },
+ { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+ AArch64::H5, AArch64::H6, AArch64::H7 },
+ { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+ AArch64::S5, AArch64::S6, AArch64::S7 },
+ { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+ AArch64::D5, AArch64::D6, AArch64::D7 },
+ { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7 }
+ };
+
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(Arg.getType());
+ unsigned SrcReg;
+ const TargetRegisterClass *RC;
+ if (VT >= MVT::i1 && VT <= MVT::i32) {
+ SrcReg = Registers[0][GPRIdx++];
+ RC = &AArch64::GPR32RegClass;
+ VT = MVT::i32;
+ } else if (VT == MVT::i64) {
+ SrcReg = Registers[1][GPRIdx++];
+ RC = &AArch64::GPR64RegClass;
+ } else if (VT == MVT::f16) {
+ SrcReg = Registers[2][FPRIdx++];
+ RC = &AArch64::FPR16RegClass;
+ } else if (VT == MVT::f32) {
+ SrcReg = Registers[3][FPRIdx++];
+ RC = &AArch64::FPR32RegClass;
+ } else if ((VT == MVT::f64) || VT.is64BitVector()) {
+ SrcReg = Registers[4][FPRIdx++];
+ RC = &AArch64::FPR64RegClass;
+ } else if (VT.is128BitVector()) {
+ SrcReg = Registers[5][FPRIdx++];
+ RC = &AArch64::FPR128RegClass;
+ } else
+ llvm_unreachable("Unexpected value type.");
+
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&Arg, ResultReg);
+ }
return true;
}
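// Editorial sketch (annotation, not part of this patch): for a function
// such as
//   define i32 @f(i32 %a, i64 %b, double %d)
// the loop above picks %a -> W0, %b -> X1 (one shared GPR index) and
// %d -> D0, registers each as a live-in, and copies it into a fresh
// virtual register before updating the value map.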
-bool AArch64FastISel::ProcessCallArgs(
- SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
- SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
- unsigned &NumBytes) {
+bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
+ SmallVectorImpl<MVT> &OutVTs,
+ unsigned &NumBytes) {
+ CallingConv::ID CC = CLI.CallConv;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+ CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
// Get a count of how many bytes are to be pushed on the stack.
NumBytes = CCInfo.getNextStackOffset();
@@ -1207,13 +2933,17 @@ bool AArch64FastISel::ProcessCallArgs(
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes);
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- unsigned Arg = ArgRegs[VA.getValNo()];
- MVT ArgVT = ArgVTs[VA.getValNo()];
+ const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ unsigned ArgReg = getRegForValue(ArgVal);
+ if (!ArgReg)
+ return false;
// Handle arg promotion: SExt, ZExt, AExt.
switch (VA.getLocInfo()) {
@@ -1222,8 +2952,8 @@ bool AArch64FastISel::ProcessCallArgs(
case CCValAssign::SExt: {
MVT DestVT = VA.getLocVT();
MVT SrcVT = ArgVT;
- Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
- if (Arg == 0)
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+ if (!ArgReg)
return false;
break;
}
@@ -1232,8 +2962,8 @@ bool AArch64FastISel::ProcessCallArgs(
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
MVT SrcVT = ArgVT;
- Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
- if (Arg == 0)
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+ if (!ArgReg)
return false;
break;
}
@@ -1244,14 +2974,18 @@ bool AArch64FastISel::ProcessCallArgs(
// Now copy/store arg to correct locations.
if (VA.isRegLoc() && !VA.needsCustom()) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
- RegArgs.push_back(VA.getLocReg());
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ CLI.OutRegs.push_back(VA.getLocReg());
} else if (VA.needsCustom()) {
// FIXME: Handle custom args.
return false;
} else {
assert(VA.isMemLoc() && "Assuming store on stack.");
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
// Need to store on the stack.
unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
@@ -1264,26 +2998,31 @@ bool AArch64FastISel::ProcessCallArgs(
Addr.setReg(AArch64::SP);
Addr.setOffset(VA.getLocMemOffset() + BEAlign);
- if (!EmitStore(ArgVT, Arg, Addr))
+ unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(Addr.getOffset()),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+
+ if (!emitStore(ArgVT, ArgReg, Addr, MMO))
return false;
}
}
return true;
}
-bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
- const Instruction *I, CallingConv::ID CC,
- unsigned &NumBytes) {
+bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+ unsigned NumBytes) {
+ CallingConv::ID CC = CLI.CallConv;
+
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
- .addImm(NumBytes)
- .addImm(0);
+ .addImm(NumBytes).addImm(0);
// Now the return value.
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
// Only handle a single return value.
@@ -1294,147 +3033,147 @@ bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
MVT CopyVT = RVLocs[0].getValVT();
unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY),
- ResultReg).addReg(RVLocs[0].getLocReg());
- UsedRegs.push_back(RVLocs[0].getLocReg());
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(RVLocs[0].getLocReg());
+ CLI.InRegs.push_back(RVLocs[0].getLocReg());
- // Finally update the result.
- UpdateValueMap(I, ResultReg);
+ CLI.ResultReg = ResultReg;
+ CLI.NumResultRegs = 1;
}
return true;
}
-bool AArch64FastISel::SelectCall(const Instruction *I,
- const char *IntrMemName = nullptr) {
- const CallInst *CI = cast<CallInst>(I);
- const Value *Callee = CI->getCalledValue();
+bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ CallingConv::ID CC = CLI.CallConv;
+ bool IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ const char *SymName = CLI.SymName;
- // Don't handle inline asm or intrinsics.
- if (isa<InlineAsm>(Callee))
+ if (!Callee && !SymName)
return false;
- // Only handle global variable Callees.
- const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV)
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
return false;
- // Check the calling convention.
- ImmutableCallSite CS(CI);
- CallingConv::ID CC = CS.getCallingConv();
+ CodeModel::Model CM = TM.getCodeModel();
+  // Only support the small and large code models.
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return false;
+
+ // FIXME: Add large code model support for ELF.
+ if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
+ return false;
// Let SDISel handle vararg functions.
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- if (FTy->isVarArg())
+ if (IsVarArg)
return false;
- // Handle *simple* calls for now.
+ // FIXME: Only handle *simple* calls for now.
MVT RetVT;
- Type *RetTy = I->getType();
- if (RetTy->isVoidTy())
+ if (CLI.RetTy->isVoidTy())
RetVT = MVT::isVoid;
- else if (!isTypeLegal(RetTy, RetVT))
+ else if (!isTypeLegal(CLI.RetTy, RetVT))
return false;
- // Set up the argument vectors.
- SmallVector<Value *, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
- SmallVector<MVT, 8> ArgVTs;
- SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
- Args.reserve(CS.arg_size());
- ArgRegs.reserve(CS.arg_size());
- ArgVTs.reserve(CS.arg_size());
- ArgFlags.reserve(CS.arg_size());
-
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
- // If we're lowering a memory intrinsic instead of a regular call, skip the
- // last two arguments, which shouldn't be passed to the underlying function.
- if (IntrMemName && e - i <= 2)
- break;
-
- unsigned Arg = getRegForValue(*i);
- if (Arg == 0)
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
return false;
- ISD::ArgFlagsTy Flags;
- unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
- Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
- Flags.setZExt();
-
- // FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
- CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
- CS.paramHasAttr(AttrInd, Attribute::Nest) ||
- CS.paramHasAttr(AttrInd, Attribute::ByVal))
- return false;
+ // Set up the argument vectors.
+ SmallVector<MVT, 16> OutVTs;
+ OutVTs.reserve(CLI.OutVals.size());
- MVT ArgVT;
- Type *ArgTy = (*i)->getType();
- if (!isTypeLegal(ArgTy, ArgVT) &&
- !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+ for (auto *Val : CLI.OutVals) {
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT) &&
+ !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
return false;
// We don't handle vector parameters yet.
- if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+ if (VT.isVector() || VT.getSizeInBits() > 64)
return false;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
-
- Args.push_back(*i);
- ArgRegs.push_back(Arg);
- ArgVTs.push_back(ArgVT);
- ArgFlags.push_back(Flags);
+ OutVTs.push_back(VT);
}
+ Address Addr;
+ if (Callee && !computeCallAddress(Callee, Addr))
+ return false;
+
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
unsigned NumBytes;
- if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ if (!processCallArgs(CLI, OutVTs, NumBytes))
return false;
// Issue the call.
MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
- if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
+ if (CM == CodeModel::Small) {
+ const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
+ if (SymName)
+ MIB.addExternalSymbol(SymName, 0);
+ else if (Addr.getGlobalValue())
+ MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
+ else if (Addr.getReg()) {
+ unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
+ MIB.addReg(Reg);
+ } else
+ return false;
+ } else {
+ unsigned CallReg = 0;
+ if (SymName) {
+ unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+ ADRPReg)
+ .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+ CallReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+ CallReg)
+ .addReg(ADRPReg)
+ .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ } else if (Addr.getGlobalValue())
+ CallReg = materializeGV(Addr.getGlobalValue());
+ else if (Addr.getReg())
+ CallReg = Addr.getReg();
+
+ if (!CallReg)
+ return false;
+
+ const MCInstrDesc &II = TII.get(AArch64::BLR);
+ CallReg = constrainOperandRegClass(II, CallReg, 0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
+ }
// Add implicit physical register uses to the call.
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i], RegState::Implicit);
+ for (auto Reg : CLI.OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
-
- // Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
- if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
- return false;
+ MIB.addRegMask(TRI.getCallPreservedMask(CC));
- // Set all unused physreg defs as dead.
- static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+ CLI.Call = MIB;
- return true;
+ // Finish off the call including any return values.
+ return finishCall(CLI, RetVT, NumBytes);
}
-bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
if (Alignment)
return Len / Alignment <= 4;
else
return Len < 32;
}
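// Editorial sketch (annotation, not part of this patch): under this
// heuristic a 16-byte copy with 8-byte alignment is inlined (16/8 = 2 <= 4
// accesses), while an unaligned 40-byte copy (40 >= 32) stays a library call.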
-bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
uint64_t Len, unsigned Alignment) {
   // Make sure we don't bloat code by inlining very large memcpys.
- if (!IsMemCpySmall(Len, Alignment))
+ if (!isMemCpySmall(Len, Alignment))
return false;
int64_t UnscaledOffset = 0;
@@ -1464,14 +3203,11 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
}
}
- bool RV;
- unsigned ResultReg;
- RV = EmitLoad(VT, ResultReg, Src);
- if (!RV)
+ unsigned ResultReg = emitLoad(VT, VT, Src);
+ if (!ResultReg)
return false;
- RV = EmitStore(VT, ResultReg, Dest);
- if (!RV)
+ if (!emitStore(VT, ResultReg, Dest))
return false;
int64_t Size = VT.getSizeInBits() / 8;
@@ -1486,73 +3222,430 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
return true;
}
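// Editorial sketch (annotation, not part of this patch): an inlined 16-byte
// copy with 8-byte alignment therefore becomes two i64 load/store pairs,
// roughly
//   ldr x8, [x1]     ; str x8, [x0]
//   ldr x8, [x1, #8] ; str x8, [x0, #8]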
-bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
- // FIXME: Handle more intrinsics.
- switch (I.getIntrinsicID()) {
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
+ const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ // Simplify multiplies.
+ unsigned IID = II->getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::smul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2)
+ IID = Intrinsic::sadd_with_overflow;
+ break;
+ case Intrinsic::umul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2)
+ IID = Intrinsic::uadd_with_overflow;
+ break;
+ }
+
+ AArch64CC::CondCode TmpCC;
+ switch (IID) {
default:
return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ TmpCC = AArch64CC::VS;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ TmpCC = AArch64CC::HS;
+ break;
+ case Intrinsic::usub_with_overflow:
+ TmpCC = AArch64CC::LO;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ TmpCC = AArch64CC::NE;
+ break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (!isValueAvailable(II))
+ return false;
+
+  // Make sure nothing is in the way between the intrinsic and its user.
+ BasicBlock::const_iterator Start = I;
+ BasicBlock::const_iterator End = II;
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
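// Editorial sketch (annotation, not part of this patch): this fold lets a
// sequence such as
//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %o = extractvalue { i32, i1 } %s, 1
//   br i1 %o, label %ovf, label %cont
// reuse the flags of the ADDS that implements the intrinsic, so the branch
// is a single "b.vs" instead of materializing and testing %o.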
+
+bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ // FIXME: Handle more intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::frameaddress: {
+ MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ const AArch64RegisterInfo *RegInfo =
+ static_cast<const AArch64RegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
+ unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
+    // Recursively load the frame address:
+ // ldr x0, [fp]
+ // ldr x0, [x0]
+ // ldr x0, [x0]
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
+ SrcReg, /*IsKill=*/true, 0);
+ assert(DestReg && "Unexpected LDR instruction emission failure.");
+ SrcReg = DestReg;
+ }
+
+ updateValueMap(II, SrcReg);
+ return true;
+ }
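// Editorial sketch (annotation, not part of this patch): a depth-2 request,
// "call i8* @llvm.frameaddress(i32 2)", thus becomes roughly
//   mov x8, x29      ; copy of the frame pointer
//   ldr x8, [x8]     ; walk up one frame
//   ldr x8, [x8]     ; depth 2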
case Intrinsic::memcpy:
case Intrinsic::memmove: {
- const MemTransferInst &MTI = cast<MemTransferInst>(I);
+ const auto *MTI = cast<MemTransferInst>(II);
// Don't handle volatile.
- if (MTI.isVolatile())
+ if (MTI->isVolatile())
return false;
// Disable inlining for memmove before calls to ComputeAddress. Otherwise,
// we would emit dead code because we don't currently handle memmoves.
- bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
- if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+ bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy);
+ if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) {
      // Small memcpys are common enough that we want to do them without a call
// if possible.
- uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
- unsigned Alignment = MTI.getAlignment();
- if (IsMemCpySmall(Len, Alignment)) {
+ uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+ unsigned Alignment = MTI->getAlignment();
+ if (isMemCpySmall(Len, Alignment)) {
Address Dest, Src;
- if (!ComputeAddress(MTI.getRawDest(), Dest) ||
- !ComputeAddress(MTI.getRawSource(), Src))
+ if (!computeAddress(MTI->getRawDest(), Dest) ||
+ !computeAddress(MTI->getRawSource(), Src))
return false;
- if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+ if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment))
return true;
}
}
- if (!MTI.getLength()->getType()->isIntegerTy(64))
+ if (!MTI->getLength()->getType()->isIntegerTy(64))
return false;
- if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+ if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
- const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
- return SelectCall(&I, IntrMemName);
+ const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
}
case Intrinsic::memset: {
- const MemSetInst &MSI = cast<MemSetInst>(I);
+ const MemSetInst *MSI = cast<MemSetInst>(II);
// Don't handle volatile.
- if (MSI.isVolatile())
+ if (MSI->isVolatile())
return false;
- if (!MSI.getLength()->getType()->isIntegerTy(64))
+ if (!MSI->getLength()->getType()->isIntegerTy(64))
return false;
- if (MSI.getDestAddressSpace() > 255)
+ if (MSI->getDestAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
- return SelectCall(&I, "memset");
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::pow: {
+ MVT RetVT;
+ if (!isTypeLegal(II->getType(), RetVT))
+ return false;
+
+ if (RetVT != MVT::f32 && RetVT != MVT::f64)
+ return false;
+
+ static const RTLIB::Libcall LibCallTable[3][2] = {
+ { RTLIB::SIN_F32, RTLIB::SIN_F64 },
+ { RTLIB::COS_F32, RTLIB::COS_F64 },
+ { RTLIB::POW_F32, RTLIB::POW_F64 }
+ };
+ RTLIB::Libcall LC;
+ bool Is64Bit = RetVT == MVT::f64;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::sin:
+ LC = LibCallTable[0][Is64Bit];
+ break;
+ case Intrinsic::cos:
+ LC = LibCallTable[1][Is64Bit];
+ break;
+ case Intrinsic::pow:
+ LC = LibCallTable[2][Is64Bit];
+ break;
+ }
+
+ ArgListTy Args;
+ Args.reserve(II->getNumArgOperands());
+
+ // Populate the argument list.
+ for (auto &Arg : II->arg_operands()) {
+ ArgListEntry Entry;
+ Entry.Val = Arg;
+ Entry.Ty = Arg->getType();
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(TLI.getLibcallCallingConv(LC), II->getType(),
+ TLI.getLibcallName(LC), std::move(Args));
+ if (!lowerCallTo(CLI))
+ return false;
+ updateValueMap(II, CLI.ResultReg);
+ return true;
+ }
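// Editorial sketch (annotation, not part of this patch): there is no
// fast-isel pattern for FSIN/FCOS/FPOW, so e.g. @llvm.sin.f64 is lowered as
// an ordinary call to the "sin" routine named by TLI.getLibcallName(LC),
// reusing the generic lowerCallTo machinery above.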
+ case Intrinsic::fabs: {
+ MVT VT;
+ if (!isTypeLegal(II->getType(), VT))
+ return false;
+
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::f32:
+ Opc = AArch64::FABSSr;
+ break;
+ case MVT::f64:
+ Opc = AArch64::FABSDr;
+ break;
+ }
+ unsigned SrcReg = getRegForValue(II->getOperand(0));
+ if (!SrcReg)
+ return false;
+ bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+ updateValueMap(II, ResultReg);
+ return true;
}
case Intrinsic::trap: {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
}
+ case Intrinsic::sqrt: {
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(II->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(II->getOperand(0));
+
+ unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the XALU-with-overflow intrinsics.
+ const Function *Callee = II->getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ // Simplify multiplies.
+ unsigned IID = II->getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::smul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2) {
+ IID = Intrinsic::sadd_with_overflow;
+ RHS = LHS;
+ }
+ break;
+ case Intrinsic::umul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2) {
+ IID = Intrinsic::uadd_with_overflow;
+ RHS = LHS;
+ }
+ break;
+ }
+
+ unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
+ AArch64CC::CondCode CC = AArch64CC::Invalid;
+ switch (IID) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::VS;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::HS;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::VS;
+ break;
+ case Intrinsic::usub_with_overflow:
+ ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::LO;
+ break;
+ case Intrinsic::smul_with_overflow: {
+ CC = AArch64CC::NE;
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (VT == MVT::i32) {
+ MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
+ /*IsKill=*/false, 32);
+ MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+ AArch64_AM::ASR, 31, /*WantResult=*/false);
+ } else {
+ assert(VT == MVT::i64 && "Unexpected value type.");
+ MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+ AArch64_AM::ASR, 63, /*WantResult=*/false);
+ }
+ break;
+ }
+ case Intrinsic::umul_with_overflow: {
+ CC = AArch64CC::NE;
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (VT == MVT::i32) {
+ MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
+ /*IsKill=*/false, AArch64_AM::LSR, 32,
+ /*WantResult=*/false);
+ MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ } else {
+ assert(VT == MVT::i64 && "Unexpected value type.");
+ MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
+ /*IsKill=*/false, /*WantResult=*/false);
+ }
+ break;
+ }
+ }
+
+ if (MulReg) {
+ ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
+ }
+
+ ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
+ AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
+ /*IsKill=*/true, getInvertedCondCode(CC));
+ (void)ResultReg2;
+ assert((ResultReg1 + 1) == ResultReg2 &&
+ "Nonconsecutive result registers.");
+ updateValueMap(II, ResultReg1, 2);
+ return true;
+ }
}
return false;
}
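// Editorial sketch (annotation, not part of this patch): for
//   {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
// the 32-bit path above emits roughly
//   umull x8, w0, w1            ; 64-bit product
//   cmp   xzr, x8, lsr #32      ; NE iff the high half is non-zero
//   mov   w9, w8                ; low half = the i32 result value
//   csinc w10, wzr, wzr, eq     ; overflow flag for the struct's i1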
-bool AArch64FastISel::SelectRet(const Instruction *I) {
+bool AArch64FastISel::selectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
@@ -1572,8 +3665,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
- CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
- I->getContext());
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
CCInfo.AnalyzeReturn(Outs, RetCC);
@@ -1586,11 +3678,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
const Value *RV = Ret->getOperand(0);
// Don't bother handling odd stuff for now.
- if (VA.getLocInfo() != CCValAssign::Full)
+ if ((VA.getLocInfo() != CCValAssign::Full) &&
+ (VA.getLocInfo() != CCValAssign::BCvt))
return false;
+
// Only handle register returns for now.
if (!VA.isRegLoc())
return false;
+
unsigned Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1606,12 +3701,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
return false;
// Vectors (of > 1 lane) in big endian need tricky handling.
- if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+ if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+ !Subtarget->isLittleEndian())
return false;
MVT RVVT = RVEVT.getSimpleVT();
if (RVVT == MVT::f128)
return false;
+
MVT DestVT = VA.getValVT();
// Special handling for extended integers.
if (RVVT != DestVT) {
@@ -1621,8 +3718,8 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
return false;
- bool isZExt = Outs[0].Flags.isZExt();
- SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+ bool IsZExt = Outs[0].Flags.isZExt();
+ SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
if (SrcReg == 0)
return false;
}
@@ -1642,7 +3739,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
return true;
}
-bool AArch64FastISel::SelectTrunc(const Instruction *I) {
+bool AArch64FastISel::selectTrunc(const Instruction *I) {
Type *DestTy = I->getType();
Value *Op = I->getOperand(0);
Type *SrcTy = Op->getType();
@@ -1667,10 +3764,14 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) {
unsigned SrcReg = getRegForValue(Op);
if (!SrcReg)
return false;
+ bool SrcIsKill = hasTrivialKill(Op);
// If we're truncating from i64 to a smaller non-legal type then generate an
- // AND. Otherwise, we know the high bits are undefined and a truncate doesn't
- // generate any code.
+  // AND. Otherwise, we know the high bits are undefined and a truncate only
+  // generates a COPY. We cannot also mark the source register as the result
+  // register, because this can incorrectly transfer the kill flag onto the
+  // source register.
+ unsigned ResultReg;
if (SrcVT == MVT::i64) {
uint64_t Mask = 0;
switch (DestVT.SimpleTy) {
@@ -1688,23 +3789,23 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) {
break;
}
// Issue an extract_subreg to get the lower 32-bits.
- unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+ unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
AArch64::sub_32);
- MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
// Create the AND instruction which performs the actual truncation.
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(Reg32)
- .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
- SrcReg = ANDReg;
+ ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
+ assert(ResultReg && "Unexpected AND instruction emission failure.");
+ } else {
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
}
- UpdateValueMap(I, SrcReg);
+ updateValueMap(I, ResultReg);
return true;
}
-unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
DestVT == MVT::i64) &&
"Unexpected value type.");
@@ -1712,14 +3813,9 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
if (DestVT == MVT::i8 || DestVT == MVT::i16)
DestVT = MVT::i32;
- if (isZExt) {
- MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
- unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ResultReg)
- .addReg(SrcReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
-
+ if (IsZExt) {
+ unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ assert(ResultReg && "Unexpected AND instruction emission failure.");
if (DestVT == MVT::i64) {
      // We're zero-extending i1 to i64. The ANDWri Wd, Ws, #1 implicitly
      // clears the upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
@@ -1737,18 +3833,389 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
// FIXME: We're SExt i1 to i64.
return 0;
}
- unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
- ResultReg)
- .addReg(SrcReg)
+ return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
+ /*TODO:IsKill=*/false, 0, 0);
+ }
+}
+
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ unsigned Opc, ZReg;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ RetVT = MVT::i32;
+ Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break;
+ case MVT::i64:
+ Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
+                          ZReg, /*IsKill=*/true);
+}
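// Editorial sketch (annotation, not part of this patch): there is no bare
// MUL instruction at this level; a multiply is MADD with a zero addend, so
// a 32-bit multiply yields "madd w0, wa, wb, wzr" (the "mul w0, wa, wb"
// alias).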
+
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ if (RetVT != MVT::i64)
+ return 0;
+
+ return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
+ Op0, Op0IsKill, Op1, Op1IsKill,
+ AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ if (RetVT != MVT::i64)
+ return 0;
+
+ return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
+ Op0, Op0IsKill, Op1, Op1IsKill,
+ AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::LSLVWr; break;
+ case MVT::i64: Opc = AArch64::LSLVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<32+s-r,32-r> = Wn<s:0> when r > s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 4
+ // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
+ // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
+ // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 8
+ // Wd<32+7-24,32-24> = Wn<7:0>
+ // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 12
+ // Wd<32+3-20,32-20> = Wn<3:0>
+ // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext
+
+ unsigned ImmR = RegSize - Shift;
+ // Limit the width to the length of the source type.
+ unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
- .addImm(0);
- return ResultReg;
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
}
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
}
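// Editorial sketch (annotation, not part of this patch): plugging numbers
// into the formulas above, "%2 = shl i16 %1, 4" with %1 = zext i8 gives
// ImmR = 32 - 4 = 28 and ImmS = min(7, 11) = 7, i.e.
//   ubfm w0, w1, #28, #7     ; same as "ubfiz w0, w1, #4, #8"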
-unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
- bool isZExt) {
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::LSRVWr; break;
+ case MVT::i64: Opc = AArch64::LSRVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op0IsKill = Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 8
+ // Wd<7-7,0> = Wn<7:7>
+ // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 12
+ // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+ // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ // It is not possible to fold a sign-extend into the LShr instruction. In this
+ // case emit a sign-extend.
+ if (!IsZExt) {
+ Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ if (!Op0)
+ return 0;
+ Op0IsKill = true;
+ SrcVT = RetVT;
+ SrcBits = SrcVT.getSizeInBits();
+ IsZExt = true;
+ }
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
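For reference, the UBFM extract form the comment block relies on behaves like "take bits <ImmS:ImmR> of the source, place them at bit 0, and zero the rest". A minimal standalone sketch (illustrative only, not part of the imported patch) that reproduces the zext rows above:

```cpp
#include <cassert>
#include <cstdint>

// Model of the 32-bit UBFM extract form (ImmR <= ImmS): the result holds
// Wn<ImmS:ImmR> in its low bits and zeroes everywhere else.
static uint32_t ubfm32(uint32_t Wn, unsigned ImmR, unsigned ImmS) {
  assert(ImmR <= ImmS && ImmS < 32 && "only the extract form is modeled");
  unsigned Width = ImmS - ImmR + 1;
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (Wn >> ImmR) & Mask;
}

int main() {
  // %1 = zext i8 0b1010_1010 to i16; %2 = lshr i16 %1, 4
  // ImmR = min(SrcBits - 1, Shift) = 4, ImmS = SrcBits - 1 = 7.
  assert(ubfm32(0b1010'1010, 4, 7) == 0b1010);
  // Shift == 8 clamps ImmR to 7, so only bit 7 survives.
  assert(ubfm32(0b1010'1010, 7, 7) == 0b1);
  // Shift >= SrcBits with a zero-extended source is handled earlier by
  // materializing the constant 0 instead of emitting a UBFM.
  return 0;
}
```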
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::ASRVWr; break;
+ case MVT::i64: Opc = AArch64::ASRVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op0IsKill = Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 8
+ // Wd<7-7:0> = Wn<7:7>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 12
+ // Wd<7-7:0> = Wn<7:7> <- clamp r to 7
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
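The sign-extending counterpart works the same way, except the extracted field is replicated from its top bit. A companion sketch (same caveats as above) reproducing the sext rows of the comment block:

```cpp
#include <cassert>
#include <cstdint>

// Model of the 32-bit SBFM extract form: Wn<ImmS:ImmR> lands at bit 0 and
// the result is sign-extended from bit (ImmS - ImmR).
static uint32_t sbfm32(uint32_t Wn, unsigned ImmR, unsigned ImmS) {
  assert(ImmR <= ImmS && ImmS < 32);
  unsigned Width = ImmS - ImmR + 1;
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  uint32_t Field = (Wn >> ImmR) & Mask;
  uint32_t SignBit = 1u << (Width - 1);
  return (Field ^ SignBit) - SignBit; // sign-extend Width bits to 32 bits
}

int main() {
  // %1 = sext i8 0b1010_1010 to i16; %2 = ashr i16 %1, 4
  // SBFM with ImmR = 4, ImmS = 7 extracts bits <7:4> and sign-extends.
  assert(sbfm32(0b1010'1010, 4, 7) == 0xFFFFFFFA);
  // Shift >= 8 clamps ImmR to 7: every result bit is a copy of bit 7.
  assert(sbfm32(0b1010'1010, 7, 7) == 0xFFFFFFFF);
  return 0;
}
```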
+unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool IsZExt) {
assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
// FastISel does not have plumbing to deal with extensions where the SrcVT or
@@ -1768,24 +4235,24 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
default:
return 0;
case MVT::i1:
- return Emiti1Ext(SrcReg, DestVT, isZExt);
+ return emiti1Ext(SrcReg, DestVT, IsZExt);
case MVT::i8:
if (DestVT == MVT::i64)
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
else
- Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
Imm = 7;
break;
case MVT::i16:
if (DestVT == MVT::i64)
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
else
- Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
Imm = 15;
break;
case MVT::i32:
assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
Imm = 31;
break;
}
@@ -1803,45 +4270,167 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
SrcReg = Src64;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(SrcReg)
- .addImm(0)
- .addImm(Imm);
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
+}
- return ResultReg;
+static bool isZExtLoad(const MachineInstr *LI) {
+ switch (LI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRWroX:
+ case AArch64::LDRBBroW:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRWroW:
+ return true;
+ }
}
-bool AArch64FastISel::SelectIntExt(const Instruction *I) {
- // On ARM, in general, integer casts don't involve legal types; this code
- // handles promotable integers. The high bits for a type smaller than
- // the register size are assumed to be undefined.
- Type *DestTy = I->getType();
- Value *Src = I->getOperand(0);
- Type *SrcTy = Src->getType();
+static bool isSExtLoad(const MachineInstr *LI) {
+ switch (LI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSWroW:
+ return true;
+ }
+}
- bool isZExt = isa<ZExtInst>(I);
- unsigned SrcReg = getRegForValue(Src);
- if (!SrcReg)
+bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
+ MVT SrcVT) {
+ const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
+ if (!LI || !LI->hasOneUse())
return false;
- EVT SrcEVT = TLI.getValueType(SrcTy, true);
- EVT DestEVT = TLI.getValueType(DestTy, true);
- if (!SrcEVT.isSimple())
+ // Check if the load instruction has already been selected.
+ unsigned Reg = lookUpRegForValue(LI);
+ if (!Reg)
return false;
- if (!DestEVT.isSimple())
+
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI)
return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
- MVT DestVT = DestEVT.getSimpleVT();
- unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
- if (ResultReg == 0)
+ // Check if the correct load instruction has been emitted - SelectionDAG might
+ // have emitted a zero-extending load, but we need a sign-extending load.
+ bool IsZExt = isa<ZExtInst>(I);
+ const auto *LoadMI = MI;
+ if (LoadMI->getOpcode() == TargetOpcode::COPY &&
+ LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
+ unsigned LoadReg = MI->getOperand(1).getReg();
+ LoadMI = MRI.getUniqueVRegDef(LoadReg);
+ assert(LoadMI && "Expected valid instruction");
+ }
+ if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
+ return false;
+
+ // Nothing to be done.
+ if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ if (IsZExt) {
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(Reg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ Reg = Reg64;
+ } else {
+ assert((MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
+ "Expected copy instruction");
+ Reg = MI->getOperand(1).getReg();
+ MI->eraseFromParent();
+ }
+ updateValueMap(I, Reg);
+ return true;
+}
+
+bool AArch64FastISel::selectIntExt(const Instruction *I) {
+ assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ "Unexpected integer extend instruction.");
+ MVT RetVT;
+ MVT SrcVT;
+ if (!isTypeSupported(I->getType(), RetVT))
+ return false;
+
+ if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
+ return false;
+
+ // Try to optimize already sign-/zero-extended values from load instructions.
+ if (optimizeIntExtLoad(I, RetVT, SrcVT))
+ return true;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Try to optimize already sign-/zero-extended values from function arguments.
+ bool IsZExt = isa<ZExtInst>(I);
+ if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
+ if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
+ if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), ResultReg)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(AArch64::sub_32);
+ SrcReg = ResultReg;
+ }
+ // Conservatively clear all kill flags from all uses, because we are
+ // replacing a sign-/zero-extend instruction at IR level with a nop at MI
+ // level. The result of the instruction at IR level might have been
+ // trivially dead, which is no longer true.
+ unsigned UseReg = lookUpRegForValue(I);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+ }
+
+ unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
+ if (!ResultReg)
return false;
- UpdateValueMap(I, ResultReg);
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
EVT DestEVT = TLI.getValueType(I->getType(), true);
if (!DestEVT.isSimple())
return false;
@@ -1851,144 +4440,529 @@ bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
return false;
unsigned DivOpc;
- bool is64bit = (DestVT == MVT::i64);
+ bool Is64bit = (DestVT == MVT::i64);
switch (ISDOpcode) {
default:
return false;
case ISD::SREM:
- DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+ DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
break;
case ISD::UREM:
- DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+ DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
break;
}
- unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+ unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
unsigned Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
+ bool Src1IsKill = hasTrivialKill(I->getOperand(1));
- unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
- .addReg(Src0Reg)
- .addReg(Src1Reg);
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
+ Src1Reg, /*IsKill=*/false);
+ assert(QuotReg && "Unexpected DIV instruction emission failure.");
// The remainder is computed as numerator - (quotient * denominator) using the
// MSUB instruction.
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
- .addReg(QuotReg)
- .addReg(Src1Reg)
- .addReg(Src0Reg);
- UpdateValueMap(I, ResultReg);
+ unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
+ Src1Reg, Src1IsKill, Src0Reg,
+ Src0IsKill);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectMul(const Instruction *I) {
- EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
- if (!SrcEVT.isSimple())
+bool AArch64FastISel::selectMul(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
- // Must be simple value type. Don't handle vectors.
- if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
- SrcVT != MVT::i8)
+ if (VT.isVector())
+ return selectBinaryOp(I, ISD::MUL);
+
+ const Value *Src0 = I->getOperand(0);
+ const Value *Src1 = I->getOperand(1);
+ if (const auto *C = dyn_cast<ConstantInt>(Src0))
+ if (C->getValue().isPowerOf2())
+ std::swap(Src0, Src1);
+
+ // Try to simplify to a shift instruction.
+ if (const auto *C = dyn_cast<ConstantInt>(Src1))
+ if (C->getValue().isPowerOf2()) {
+ uint64_t ShiftVal = C->getValue().logBase2();
+ MVT SrcVT = VT;
+ bool IsZExt = true;
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT VT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) {
+ SrcVT = VT;
+ IsZExt = true;
+ Src0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT VT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) {
+ SrcVT = VT;
+ IsZExt = false;
+ Src0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(Src0);
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(Src0);
+
+ unsigned ResultReg =
+ emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+ bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectShift(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (RetVT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = 0;
+ uint64_t ShiftVal = C->getZExtValue();
+ MVT SrcVT = RetVT;
+ bool IsZExt = I->getOpcode() != Instruction::AShr;
+ const Value *Op0 = I->getOperand(0);
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = true;
+ Op0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = false;
+ Op0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(Op0);
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (!Op1Reg)
+ return false;
+ bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = 0;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ }
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectBitCast(const Instruction *I) {
+ MVT RetVT, SrcVT;
+
+ if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT))
+ return false;
+ if (!isTypeLegal(I->getType(), RetVT))
return false;
unsigned Opc;
- unsigned ZReg;
- switch (SrcVT.SimpleTy) {
+ if (RetVT == MVT::f32 && SrcVT == MVT::i32)
+ Opc = AArch64::FMOVWSr;
+ else if (RetVT == MVT::f64 && SrcVT == MVT::i64)
+ Opc = AArch64::FMOVXDr;
+ else if (RetVT == MVT::i32 && SrcVT == MVT::f32)
+ Opc = AArch64::FMOVSWr;
+ else if (RetVT == MVT::i64 && SrcVT == MVT::f64)
+ Opc = AArch64::FMOVDXr;
+ else
+ return false;
+
+ const TargetRegisterClass *RC = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: RC = &AArch64::GPR32RegClass; break;
+ case MVT::i64: RC = &AArch64::GPR64RegClass; break;
+ case MVT::f32: RC = &AArch64::FPR32RegClass; break;
+ case MVT::f64: RC = &AArch64::FPR64RegClass; break;
+ }
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+ unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectFRem(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ RTLIB::Libcall LC;
+ switch (RetVT.SimpleTy) {
default:
return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- ZReg = AArch64::WZR;
- Opc = AArch64::MADDWrrr;
- SrcVT = MVT::i32;
+ case MVT::f32:
+ LC = RTLIB::REM_F32;
break;
- case MVT::i64:
- ZReg = AArch64::XZR;
- Opc = AArch64::MADDXrrr;
+ case MVT::f64:
+ LC = RTLIB::REM_F64;
break;
}
+ ArgListTy Args;
+ Args.reserve(I->getNumOperands());
+
+ // Populate the argument list.
+ for (auto &Arg : I->operands()) {
+ ArgListEntry Entry;
+ Entry.Val = Arg;
+ Entry.Ty = Arg->getType();
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(TLI.getLibcallCallingConv(LC), I->getType(),
+ TLI.getLibcallName(LC), std::move(Args));
+ if (!lowerCallTo(CLI))
+ return false;
+ updateValueMap(I, CLI.ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectSDiv(const Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ if (!isa<ConstantInt>(I->getOperand(1)))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
+ if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
+ !(C.isPowerOf2() || (-C).isPowerOf2()))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ unsigned Lg2 = C.countTrailingZeros();
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
- unsigned Src1Reg = getRegForValue(I->getOperand(1));
- if (!Src1Reg)
+ if (cast<BinaryOperator>(I)->isExact()) {
+ unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
+ if (!ResultReg)
+ return false;
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
+ unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+ if (!AddReg)
return false;
- // Create the base instruction, then add the operands.
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Src0Reg)
- .addReg(Src1Reg)
- .addReg(ZReg);
- UpdateValueMap(I, ResultReg);
+ // (Src0 < 0) ? Pow2 - 1 : 0;
+ if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+ return false;
+
+ unsigned SelectOpc;
+ const TargetRegisterClass *RC;
+ if (VT == MVT::i64) {
+ SelectOpc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
+ } else {
+ SelectOpc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
+ }
+ unsigned SelectReg =
+ fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
+ Src0IsKill, AArch64CC::LT);
+ if (!SelectReg)
+ return false;
+
+ // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also
+ // negate the result.
+ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ unsigned ResultReg;
+ if (C.isNegative())
+ ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
+ SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
+ else
+ ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
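The sequence emitted here is the classic fix-up for signed division by a power of two: a plain ASR rounds toward negative infinity, so negative dividends are first biased by Pow2 - 1 (the CSEL on Src0 < 0), and a negative divisor additionally negates the quotient (in the code above, the negate and the final ASR are fused into one SUB with an ASR-shifted operand). A minimal sketch of that arithmetic (illustrative, not part of the patch; it assumes `>>` on a negative int is an arithmetic shift, as it is on AArch64):

```cpp
#include <cassert>
#include <cstdint>

// Truncating signed division by (NegDivisor ? -1 : 1) * 2^Lg2.
static int32_t sdivPow2(int32_t X, unsigned Lg2, bool NegDivisor) {
  int32_t Biased = (X < 0) ? X + ((1 << Lg2) - 1) : X; // CSEL on X < 0
  int32_t Quot = Biased >> Lg2;                        // ASR by Lg2
  return NegDivisor ? -Quot : Quot;                    // negate if divisor < 0
}

int main() {
  assert(sdivPow2(7, 2, false) == 7 / 4);   //  1
  assert(sdivPow2(-7, 2, false) == -7 / 4); // -1 (a bare ASR would give -2)
  assert(sdivPow2(-8, 2, false) == -8 / 4); // -2
  assert(sdivPow2(7, 2, true) == 7 / -4);   // -1
  return 0;
}
```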
+/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We
+/// have to duplicate it for AArch64, because otherwise we would fail during the
+/// sign-extend emission.
+std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+ unsigned IdxN = getRegForValue(Idx);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return std::pair<unsigned, bool>(0, false);
+
+ bool IdxNIsKill = hasTrivialKill(Idx);
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy();
+ EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsLT(PtrVT)) {
+ IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
+ IdxNIsKill = true;
+ } else if (IdxVT.bitsGT(PtrVT))
+ llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
+ return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+}
+
+/// This is mostly a copy of the existing FastISel GEP code, but we have to
+/// duplicate it for AArch64, because otherwise we would bail out even for
+/// simple cases. This is because the standard fastEmit functions don't cover
+/// MUL at all and ADD is lowered very inefficiently.
+bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+ unsigned N = getRegForValue(I->getOperand(0));
+ if (!N)
+ return false;
+ bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+ // into a single N = N + TotalOffset.
+ uint64_t TotalOffs = 0;
+ Type *Ty = I->getOperand(0)->getType();
+ MVT VT = TLI.getPointerTy();
+ for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
+ const Value *Idx = *OI;
+ if (auto *StTy = dyn_cast<StructType>(Ty)) {
+ unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+ // N = N + Offset
+ if (Field)
+ TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+ Ty = StTy->getElementType(Field);
+ } else {
+ Ty = cast<SequentialType>(Ty)->getElementType();
+ // If this is a constant subscript, handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->isZero())
+ continue;
+ // N = N + Offset
+ TotalOffs +=
+ DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
+ continue;
+ }
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+ unsigned IdxN = Pair.first;
+ bool IdxNIsKill = Pair.second;
+ if (!IdxN)
+ return false;
+
+ if (ElementSize != 1) {
+ unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
+ if (!C)
+ return false;
+ IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+ if (!IdxN)
+ return false;
+ IdxNIsKill = true;
+ }
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+ if (!N)
+ return false;
+ }
+ }
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ }
+ updateValueMap(I, N);
return true;
}
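The running TotalOffs means any run of constant indices collapses into a single emitAdd_ri_ rather than one add per index. A small worked example of the offset arithmetic (hypothetical struct layout, chosen to match a typical AArch64 DataLayout; not from the patch):

```cpp
#include <cassert>
#include <cstdint>

// getelementptr {i32, [4 x i64]}* %p, i64 1, i32 1, i64 2
// Layout: i32 at offset 0, 4 bytes of padding, [4 x i64] at offset 8;
// total struct size 40 bytes.
int main() {
  const uint64_t StructSize = 40;
  uint64_t TotalOffs = 0;
  TotalOffs += 1 * StructSize; // outer index: N + Idx * ElementSize
  TotalOffs += 8;              // field 1: StructLayout element offset
  TotalOffs += 2 * 8;          // constant array index: 2 * sizeof(i64)
  assert(TotalOffs == 64);     // folded into a single N = N + 64
  return 0;
}
```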
-bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
+bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
default:
break;
- case Instruction::Load:
- return SelectLoad(I);
- case Instruction::Store:
- return SelectStore(I);
+ case Instruction::Add:
+ case Instruction::Sub:
+ return selectAddSub(I);
+ case Instruction::Mul:
+ return selectMul(I);
+ case Instruction::SDiv:
+ return selectSDiv(I);
+ case Instruction::SRem:
+ if (!selectBinaryOp(I, ISD::SREM))
+ return selectRem(I, ISD::SREM);
+ return true;
+ case Instruction::URem:
+ if (!selectBinaryOp(I, ISD::UREM))
+ return selectRem(I, ISD::UREM);
+ return true;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return selectShift(I);
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return selectLogicalOp(I);
case Instruction::Br:
- return SelectBranch(I);
+ return selectBranch(I);
case Instruction::IndirectBr:
- return SelectIndirectBr(I);
- case Instruction::FCmp:
- case Instruction::ICmp:
- return SelectCmp(I);
- case Instruction::Select:
- return SelectSelect(I);
- case Instruction::FPExt:
- return SelectFPExt(I);
- case Instruction::FPTrunc:
- return SelectFPTrunc(I);
+ return selectIndirectBr(I);
+ case Instruction::BitCast:
+ if (!FastISel::selectBitCast(I))
+ return selectBitCast(I);
+ return true;
case Instruction::FPToSI:
- return SelectFPToInt(I, /*Signed=*/true);
+ if (!selectCast(I, ISD::FP_TO_SINT))
+ return selectFPToInt(I, /*Signed=*/true);
+ return true;
case Instruction::FPToUI:
- return SelectFPToInt(I, /*Signed=*/false);
+ return selectFPToInt(I, /*Signed=*/false);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return selectIntExt(I);
+ case Instruction::Trunc:
+ if (!selectCast(I, ISD::TRUNCATE))
+ return selectTrunc(I);
+ return true;
+ case Instruction::FPExt:
+ return selectFPExt(I);
+ case Instruction::FPTrunc:
+ return selectFPTrunc(I);
case Instruction::SIToFP:
- return SelectIntToFP(I, /*Signed=*/true);
+ if (!selectCast(I, ISD::SINT_TO_FP))
+ return selectIntToFP(I, /*Signed=*/true);
+ return true;
case Instruction::UIToFP:
- return SelectIntToFP(I, /*Signed=*/false);
- case Instruction::SRem:
- return SelectRem(I, ISD::SREM);
- case Instruction::URem:
- return SelectRem(I, ISD::UREM);
- case Instruction::Call:
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- return SelectIntrinsicCall(*II);
- return SelectCall(I);
+ return selectIntToFP(I, /*Signed=*/false);
+ case Instruction::Load:
+ return selectLoad(I);
+ case Instruction::Store:
+ return selectStore(I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ return selectCmp(I);
+ case Instruction::Select:
+ return selectSelect(I);
case Instruction::Ret:
- return SelectRet(I);
- case Instruction::Trunc:
- return SelectTrunc(I);
- case Instruction::ZExt:
- case Instruction::SExt:
- return SelectIntExt(I);
- case Instruction::Mul:
- // FIXME: This really should be handled by the target-independent selector.
- return SelectMul(I);
+ return selectRet(I);
+ case Instruction::FRem:
+ return selectFRem(I);
+ case Instruction::GetElementPtr:
+ return selectGetElementPtr(I);
}
- return false;
+
+ // Fall back to target-independent instruction selection.
+ return selectOperator(I, I->getOpcode());
// Silence warnings.
(void)&CC_AArch64_DarwinPCS_VarArg;
}
namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) {
- return new AArch64FastISel(funcInfo, libInfo);
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) {
+ return new AArch64FastISel(FuncInfo, LibInfo);
}
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9c33717..66aa216 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -17,16 +17,16 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -86,13 +86,14 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
#ifndef NDEBUG
- const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
assert(!RegInfo->needsStackRealignment(MF) &&
"No stack realignment on AArch64!");
#endif
return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken());
+ MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
+ MFI->hasPatchPoint());
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -109,13 +110,13 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
DebugLoc DL = I->getDebugLoc();
int Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
@@ -131,7 +132,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
//
- // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+ // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
@@ -158,7 +159,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
@@ -166,7 +167,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout *TD = MF.getSubtarget().getDataLayout();
bool HasFP = hasFP(MF);
// Calculate amount of bytes used for return address storing.
@@ -195,7 +196,8 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, Offset - TotalSkipped));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
@@ -205,8 +207,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
- const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ MF.getSubtarget().getRegisterInfo());
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
@@ -233,7 +235,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
} else if (NumBytes) {
++NumRedZoneFunctions;
}
@@ -300,7 +303,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
if (needsFrameMoves) {
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout *TD = MF.getSubtarget().getDataLayout();
const int StackGrowth = -TD->getPointerSize(0);
unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -376,26 +379,30 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
// Record the location of the stored LR
unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
// Record the location of the stored FP
CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
// Now emit the moves for whatever callee saved regs we have.
@@ -435,9 +442,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
unsigned RetOpcode = MBBI->getOpcode();
@@ -548,7 +555,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
bool PreferFP) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
int FPOffset = MFI->getObjectOffset(FI) + 16;
int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
@@ -617,7 +624,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
unsigned Count = CSI.size();
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -693,7 +700,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
unsigned Count = CSI.size();
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -761,7 +768,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
MachineFunction &MF, RegScavenger *RS) const {
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineRegisterInfo *MRI = &MF.getRegInfo();
SmallVector<unsigned, 4> UnspilledCSGPRs;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 7686e6f..df3875f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64_FRAMELOWERING_H
-#define AArch64_FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#include "llvm/Target/TargetFrameLowering.h"
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3f49fab..bb2e1e2 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -303,7 +303,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
/// \brief Determine whether it is worth folding V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
- // it hurts if the a value is used at least twice, unless we are optimizing
+ // it hurts if the value is used at least twice, unless we are optimizing
// for code size.
if (ForCodeSize || V.hasOneUse())
return true;
@@ -569,6 +569,27 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
return isWorthFolding(N);
}
+/// If there's a use of this ADDlow that's not itself a load/store then we'll
+/// need to create a real ADD instruction from it anyway and there's no point in
+/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
+/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
+/// leads to duplicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+ for (auto Use : N->uses()) {
+ if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+ Use->getOpcode() != ISD::ATOMIC_LOAD &&
+ Use->getOpcode() != ISD::ATOMIC_STORE)
+ return false;
+
+ // ldar and stlr have much more restrictive addressing modes (just a
+ // register).
+ if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+ return false;
+ }
+
+ return true;
+}
+
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
@@ -582,7 +603,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
return true;
}
- if (N.getOpcode() == AArch64ISD::ADDlow) {
+ if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
GlobalAddressSDNode *GAN =
dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
Base = N.getOperand(0);
@@ -594,7 +615,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
unsigned Alignment = GV->getAlignment();
const DataLayout *DL = TLI->getDataLayout();
Type *Ty = GV->getType()->getElementType();
- if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin())
+ if (Alignment == 0 && Ty->isSized())
Alignment = DL->getABITypeAlignment(Ty);
if (Alignment >= Size)
@@ -777,6 +798,21 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
return false;
}
+// Check if the given immediate is preferred by ADD. Return true if the
+// immediate can be encoded in an ADD, or if it can be encoded in an
+// "ADD LSL #12" but cannot be encoded by a single MOVZ.
+static bool isPreferredADD(int64_t ImmOff) {
+ // Constant in [0x0, 0xfff] can be encoded in ADD.
+ if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+ return true;
+ // Check if it can be encoded in an "ADD LSL #12".
+ if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
+ // As a single MOVZ is faster than an "ADD LSL #12", ignore such constants.
+ return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+ (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+ return false;
+}
+
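Spelled out, the three masks test "fits in 12 bits", "fits in 12 bits shifted left by 12", and "is not also reachable with one MOVZ" (a 16-bit immediate at LSL #0 or LSL #16). An equivalent restatement with named masks and a few check values (illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool preferADD(int64_t ImmOff) {
  uint64_t V = static_cast<uint64_t>(ImmOff);
  if ((V & ~0xfffULL) == 0)        // fits ADD #imm12
    return true;
  if ((V & ~0xfff000ULL) == 0)     // fits ADD #imm12, LSL #12 ...
    return (V & ~0xffffULL) != 0 &&      // ... and no MOVZ #imm16, LSL #0
           (V & ~0xffff0000ULL) != 0;    // ... and no MOVZ #imm16, LSL #16
  return false;
}

int main() {
  assert(preferADD(0xfff));     // a single ADD
  assert(preferADD(0x11000));   // ADD LSL #12; no single MOVZ covers it
  assert(!preferADD(0xf000));   // MOVZ #0xf000 is cheaper
  assert(!preferADD(0xff0000)); // MOVZ #0xff, LSL #16 is cheaper
  return 0;
}
```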
bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
@@ -786,11 +822,6 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
- // We don't want to match immediate adds here, because they are better lowered
- // to the register-immediate addressing modes.
- if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
- return false;
-
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
@@ -800,6 +831,36 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
return false;
}
+ // Watch out if RHS is a wide immediate: it cannot be selected into the
+ // [BaseReg+Imm] addressing mode, and it may not be encodable in ADD/SUB
+ // either. Instead it will use the [BaseReg + 0] address mode and generate
+ // instructions like:
+ // MOV X0, WideImmediate
+ // ADD X1, BaseReg, X0
+ // LDR X2, [X1, 0]
+ // In such situations, using the [BaseReg, XReg] addressing mode saves one
+ // ADD/SUB:
+ // MOV X0, WideImmediate
+ // LDR X2, [BaseReg, X0]
+ if (isa<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = (int64_t)dyn_cast<ConstantSDNode>(RHS)->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ // Skip if the immediate can be selected by the load/store addressing mode.
+ // Also skip if the immediate can be encoded by a single ADD (SUB is also
+ // checked by using -ImmOff).
+ if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+ isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+ return false;
+
+ SDLoc DL(N.getNode());
+ SDValue Ops[] = { RHS };
+ SDNode *MOVI =
+ CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+ SDValue MOVIV = SDValue(MOVI, 0);
+ // This ADD of two X registers will be selected into the [Reg+Reg] mode.
+ N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+ }
+
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
@@ -1381,20 +1442,21 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
return true;
}
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
- unsigned &LSB, unsigned &MSB) {
- // We are looking for the following pattern which basically extracts a single
- // bit from the source value and places it in the LSB of the destination
- // value, all other bits of the destination value or set to zero:
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &LSB,
+ unsigned &MSB) {
+ // We are looking for the following pattern, which basically extracts several
+ // contiguous bits from the source value and places them at the LSB of the
+ // destination value; all other bits of the destination value are set to zero:
//
// Value2 = AND Value, MaskImm
// SRL Value2, ShiftImm
//
- // with MaskImm >> ShiftImm == 1.
+ // where MaskImm >> ShiftImm is a contiguous mask that determines the bit width.
//
// This gets selected into a single UBFM:
//
- // UBFM Value, ShiftImm, ShiftImm
+ // UBFM Value, ShiftImm, BitWide + Srl_imm - 1
//
if (N->getOpcode() != ISD::SRL)
@@ -1410,15 +1472,16 @@ static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
if (!isIntImmediate(N->getOperand(1), Srl_imm))
return false;
- // Check whether we really have a one bit extract here.
- if (And_mask >> Srl_imm == 0x1) {
+ // Check whether we really have a contiguous several-bit extract here.
+ unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm));
+ if (BitWide && isMask_64(And_mask >> Srl_imm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
else
Opc = AArch64::UBFMXri;
- LSB = MSB = Srl_imm;
-
+ LSB = Srl_imm;
+ MSB = BitWide + Srl_imm - 1;
return true;
}
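In other words: when And_mask >> Srl_imm is a contiguous low mask of BitWide bits, the AND + SRL pair is exactly UBFM Value, Srl_imm, Srl_imm + BitWide - 1. A plain restatement of the width computation with two check values (illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

// Returns true and sets [LSB, MSB] if (Value & AndMask) >> SrlImm is a
// contiguous bitfield extract, i.e. AndMask >> SrlImm == 2^BitWide - 1.
static bool matchSeveralBitsExtract(uint64_t AndMask, unsigned SrlImm,
                                    unsigned &LSB, unsigned &MSB) {
  uint64_t M = AndMask >> SrlImm;
  if (M == 0 || (M & (M + 1)) != 0) // must be a nonempty low mask
    return false;
  unsigned BitWide = 0;             // same as 64 - CountLeadingOnes_64(~M)
  while (BitWide < 64 && (M >> BitWide))
    ++BitWide;
  LSB = SrlImm;
  MSB = SrlImm + BitWide - 1;
  return true;
}

int main() {
  unsigned LSB, MSB;
  // (x & 0xff0) >> 4 extracts bits <11:4>: UBFM x, 4, 11.
  assert(matchSeveralBitsExtract(0xff0, 4, LSB, MSB) && LSB == 4 && MSB == 11);
  // (x & 0x5) >> 1 is not contiguous after the shift, so no single UBFM.
  assert(!matchSeveralBitsExtract(0x5, 1, LSB, MSB));
  return 0;
}
```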
@@ -1439,8 +1502,8 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
- // Check for AND + SRL doing a one bit extract.
- if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+ // Check for AND + SRL doing a several-bit extract.
+ if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
return true;
// we're looking for a shift of a shift
@@ -2116,7 +2179,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case 32:
SubReg = AArch64::ssub;
break;
- case 16: // FALLTHROUGH
+ case 16:
+ SubReg = AArch64::hsub;
+ break;
case 8:
llvm_unreachable("unexpected zext-requiring extract element!");
}
@@ -2204,9 +2269,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
@@ -2222,9 +2287,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
@@ -2240,9 +2305,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
@@ -2258,9 +2323,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
@@ -2276,9 +2341,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
@@ -2294,9 +2359,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
@@ -2312,9 +2377,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
@@ -2330,9 +2395,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
@@ -2348,9 +2413,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
@@ -2364,7 +2429,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld2lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 2, AArch64::LD2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 2, AArch64::LD2i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2376,7 +2442,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld3lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 3, AArch64::LD3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 3, AArch64::LD3i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2388,7 +2455,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld4lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 4, AArch64::LD4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 4, AArch64::LD4i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2448,9 +2516,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 2, AArch64::ST1Twov8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 2, AArch64::ST1Twov16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 2, AArch64::ST1Twov4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 2, AArch64::ST1Twov8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 2, AArch64::ST1Twov2s);
@@ -2467,9 +2535,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 3, AArch64::ST1Threev8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 3, AArch64::ST1Threev16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 3, AArch64::ST1Threev4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 3, AArch64::ST1Threev8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 3, AArch64::ST1Threev2s);
@@ -2486,9 +2554,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 4, AArch64::ST1Fourv8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 4, AArch64::ST1Fourv16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 4, AArch64::ST1Fourv4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 4, AArch64::ST1Fourv8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 4, AArch64::ST1Fourv2s);
@@ -2505,9 +2573,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 2, AArch64::ST2Twov8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 2, AArch64::ST2Twov16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 2, AArch64::ST2Twov4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 2, AArch64::ST2Twov8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 2, AArch64::ST2Twov2s);
@@ -2524,9 +2592,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 3, AArch64::ST3Threev8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 3, AArch64::ST3Threev16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 3, AArch64::ST3Threev4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 3, AArch64::ST3Threev8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 3, AArch64::ST3Threev2s);
@@ -2543,9 +2611,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 4, AArch64::ST4Fourv8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 4, AArch64::ST4Fourv16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 4, AArch64::ST4Fourv4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 4, AArch64::ST4Fourv8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 4, AArch64::ST4Fourv2s);
@@ -2560,7 +2628,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st2lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 2, AArch64::ST2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 2, AArch64::ST2i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2573,7 +2642,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st3lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 3, AArch64::ST3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 3, AArch64::ST3i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2586,7 +2656,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st4lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 4, AArch64::ST4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 4, AArch64::ST4i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2603,9 +2674,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
@@ -2622,9 +2693,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
@@ -2641,9 +2712,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
@@ -2660,9 +2731,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
@@ -2679,9 +2750,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
@@ -2698,9 +2769,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
@@ -2717,9 +2788,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
@@ -2736,9 +2807,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
@@ -2755,9 +2826,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
@@ -2774,9 +2845,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
@@ -2791,7 +2862,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD1LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2804,7 +2876,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD2LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2817,7 +2890,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD3LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2830,7 +2904,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD4LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2846,9 +2921,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
@@ -2866,9 +2941,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
@@ -2886,9 +2961,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
@@ -2906,9 +2981,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
@@ -2926,9 +3001,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
@@ -2946,9 +3021,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
@@ -2964,7 +3039,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2978,7 +3054,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2992,7 +3069,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
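Note: the several hundred lines of AArch64ISelDAGToDAG.cpp changes above are one mechanical edit repeated across every structured load/store selector: v4f16 and v8f16 reuse the same .4h/.8h instruction forms as v4i16 and v8i16. A minimal sketch of what this enables, assuming a toolchain whose arm_neon.h provides the float16 intrinsics:

    #include <arm_neon.h>

    // With the selector changes above, a half-precision de-interleaving
    // load can be matched to LD2Twov4h just like its integer twin.
    float16x4x2_t load_pair(const float16_t *p) {
      return vld2_f16(p);   // expected: ld2 { v0.4h, v1.4h }, [x0]
    }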
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index f2004ea..0d44f99 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
-#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -38,10 +39,12 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+namespace {
enum AlignMode {
StrictAlign,
NoStrictAlign
};
+}
static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
@@ -64,18 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
cl::init(false));
-//===----------------------------------------------------------------------===//
-// AArch64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
- if (TT.isOSBinFormatMachO())
- return new AArch64_MachoTargetObjectFile();
-
- return new AArch64_ELFTargetObjectFile();
-}
-AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
+ : TargetLowering(TM) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
@@ -106,6 +100,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
+ addDRTypeForNEON(MVT::v4f16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ addQRTypeForNEON(MVT::v8f16);
}
// Compute derived properties from the register classes
@@ -278,6 +274,94 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ // f16 is storage-only, so we promote operations to f32 if we know this is
+ // valid, and ignore them otherwise. The operations not mentioned here will
+ // fail to select, but this is not a major problem as no source language
+ // should be emitting native f16 operations yet.
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+
+ // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+ // known to be safe.
+ setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+ // Expand all other v4f16 operations.
+ // FIXME: We could generate better code by promoting some operations to
+ // a pair of v4f32s
+ setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+ // v8f16 is also a storage-only type, so expand it.
+ setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
// AArch64 has implementations of a lot of rounding-like FP operations.
static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
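Note: the f16/v4f16/v8f16 actions above boil down to one rule: half precision is a storage format, so any arithmetic that is kept must round-trip through f32. A hedged sketch of the resulting code shape for a scalar add (exact instructions depend on the subtarget):

    // Promote means: widen both operands, operate in single precision,
    // narrow the result back to half.
    __fp16 add_halves(__fp16 a, __fp16 b) {
      return a + b;  // fcvt s0, h0 ; fcvt s1, h1 ; fadd s0, s0, s1 ; fcvt h0, s0
    }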
@@ -303,13 +387,24 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
+ // Make floating-point constants legal for the large code model, so they don't
+ // become loads from the constant pool.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ }
+
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+ }
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
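Note: marking EXTLOAD as Expand for every FP type pair means an extending load is always split into a plain load followed by a convert; a small illustration of the assumed codegen:

    // No single fp-extending load instruction exists, so this becomes
    // two instructions: a half load and a widening convert.
    float widen(const __fp16 *p) {
      return *p;   // ldr h0, [x0] ; fcvt s0, h0
    }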
@@ -439,30 +534,31 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ // Custom handling for some quad-vector types to detect MULL.
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Expand);
-
- setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-
- setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction((MVT::SimpleValueType)VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
}
// AArch64 has implementations of a lot of rounding-like FP operations.
@@ -477,16 +573,20 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
+
+ // Prefer likely predicted branches to selects on out-of-order cores.
+ if (Subtarget->isCortexA57())
+ PredictableSelectIsExpensive = true;
}
void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
- if (VT == MVT::v2f32) {
+ if (VT == MVT::v2f32 || VT == MVT::v4f16) {
setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
- } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
@@ -523,7 +623,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
- setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+ for (MVT InnerVT : MVT::all_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
@@ -727,6 +828,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
+ case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
@@ -756,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
+ case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
+ case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
}
}
@@ -774,7 +878,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
MachineFunction *MF = MBB->getParent();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI->getDebugLoc();
@@ -1020,6 +1125,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ SDValue Cmp;
+ AArch64CC::CondCode AArch64CC;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1051,9 +1158,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
break;
case ISD::SETLE:
case ISD::SETGT:
- if ((VT == MVT::i32 && C != 0x7fffffff &&
+ if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
- (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+ (VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1062,9 +1169,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
break;
case ISD::SETULE:
case ISD::SETUGT:
- if ((VT == MVT::i32 && C != 0xffffffff &&
+ if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
- (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+ (VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1074,9 +1181,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
}
-
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the -1
+ // constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+ // both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+ if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+ isa<LoadSDNode>(LHS)) {
+ if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt,
+ DAG.getConstant(ValueofRHS, RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+ return Cmp;
+ }
+ }
+ }
+ }
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
return Cmp;
}
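Note: the asm pair in the comment above corresponds to source like the following; a hedged example, since the rewrite only fires when the zero-extending i16 load has a single use and the negated constant is a legal arithmetic immediate:

    // 0xffff cannot be encoded as a cmp immediate, but after
    // sign-extending the load it is a compare against -1, i.e. cmn #1.
    int is_all_ones(const unsigned short *p) {
      return *p == 0xffff;   // ldrsh w8, [x0] ; cmn w8, #1 ; cset w0, eq
    }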
@@ -1333,8 +1476,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- // The data thing is not used.
- // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
bool IsStream = !Locality;
// When the locality number is set
@@ -1349,6 +1491,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
// built the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
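Note: the locality-to-cache-level remapping happens just above this hunk; once the fields are known, the operand packs as shown. A worked sketch of the encoding, with field values assumed from the surrounding code:

    // Mirrors the bit layout built above: load/store bit, data vs.
    // instruction cache, cache level, streaming hint.
    unsigned prfOp(unsigned IsWrite, unsigned IsData, unsigned Level,
                   unsigned IsStream) {
      return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | IsStream;
    }
    // prfOp(0, 1, 0, 0) == 0  -> PLDL1KEEP
    // prfOp(0, 1, 0, 1) == 1  -> PLDL1STRM
    // prfOp(1, 1, 0, 0) == 16 -> PSTL1KEEP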
@@ -1400,7 +1543,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
- SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+ MVT ExtVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
@@ -1505,7 +1651,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
(ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
- StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
.setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
@@ -1529,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
0);
}
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+ if (OrigVT.getSizeInBits() >= 64)
+ return OrigVT;
+
+ assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+ MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ return MVT::v2i32;
+ case MVT::v4i8:
+ return MVT::v4i16;
+ }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+ const EVT &OrigTy,
+ const EVT &ExtTy,
+ unsigned ExtOpcode) {
+ // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+ // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+ // 64-bits we need to insert a new extension so that it will be 64-bits.
+ assert(ExtTy.is128BitVector() && "Unexpected extension size");
+ if (OrigTy.getSizeInBits() >= 64)
+ return N;
+
+ // Must extend size to at least 64 bits to be used as an operand for VMULL.
+ EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+ return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+ bool isSigned) {
+ EVT VT = N->getValueType(0);
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDNode *Elt = N->getOperand(i).getNode();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ unsigned HalfSize = EltSize / 2;
+ if (isSigned) {
+ if (!isIntN(HalfSize, C->getSExtValue()))
+ return false;
+ } else {
+ if (!isUIntN(HalfSize, C->getZExtValue()))
+ return false;
+ }
+ continue;
+ }
+ return false;
+ }
+
+ return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+ N->getOperand(0)->getValueType(0),
+ N->getValueType(0),
+ N->getOpcode());
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+ EVT VT = N->getValueType(0);
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT TruncVT = MVT::getIntegerVT(EltSize);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+ const APInt &CInt = C->getAPIntValue();
+ // Element types smaller than 32 bits are not legal, so use i32 elements.
+ // The values are implicitly truncated so sext vs. zext doesn't matter.
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+ MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, true))
+ return true;
+ return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, false))
+ return true;
+ return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+ }
+ return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+ }
+ return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MUL");
+ SDNode *N0 = Op.getOperand(0).getNode();
+ SDNode *N1 = Op.getOperand(1).getNode();
+ unsigned NewOpc = 0;
+ bool isMLA = false;
+ bool isN0SExt = isSignExtended(N0, DAG);
+ bool isN1SExt = isSignExtended(N1, DAG);
+ if (isN0SExt && isN1SExt)
+ NewOpc = AArch64ISD::SMULL;
+ else {
+ bool isN0ZExt = isZeroExtended(N0, DAG);
+ bool isN1ZExt = isZeroExtended(N1, DAG);
+ if (isN0ZExt && isN1ZExt)
+ NewOpc = AArch64ISD::UMULL;
+ else if (isN1SExt || isN1ZExt) {
+ // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+ // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+ if (isN1SExt && isAddSubSExt(N0, DAG)) {
+ NewOpc = AArch64ISD::SMULL;
+ isMLA = true;
+ } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+ std::swap(N0, N1);
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ }
+ }
+
+ if (!NewOpc) {
+ if (VT == MVT::v2i64)
+ // Fall through to expand this. It is not legal.
+ return SDValue();
+ else
+ // Other vector multiplications are legal.
+ return Op;
+ }
+ }
+
+ // Legalize to a S/UMULL instruction
+ SDLoc DL(Op);
+ SDValue Op0;
+ SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+ if (!isMLA) {
+ Op0 = skipExtensionForVectorMULL(N0, DAG);
+ assert(Op0.getValueType().is64BitVector() &&
+ Op1.getValueType().is64BitVector() &&
+ "unexpected types for extended operands to VMULL");
+ return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+ }
+ // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
+ // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
+ // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
+ SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+ SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+ EVT Op1VT = Op1.getValueType();
+ return DAG.getNode(N0->getOpcode(), DL, VT,
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
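Note: LowerMUL's job, in intrinsic form: when both operands of a 128-bit multiply are sign or zero extensions of 64-bit vectors, select the widening multiply directly. A minimal sketch; the intrinsic is simply the canonical way to produce the (sext a) * (sext b) pattern:

    #include <arm_neon.h>

    // Both operands arrive as sign extensions of v4i16, so the custom
    // lowering emits AArch64ISD::SMULL rather than extending both sides
    // and falling back to a (nonexistent) full-width v4i32 mul chain.
    int32x4_t widening_mul(int16x4_t a, int16x4_t b) {
      return vmull_s16(a, b);   // smull v0.4s, v0.4h, v1.4h
    }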
@@ -1629,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
+ case ISD::MUL:
+ return LowerMUL(Op, DAG);
}
}
@@ -1643,8 +1982,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
#include "AArch64GenCallingConv.inc"
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
@@ -1669,8 +2007,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1774,10 +2112,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
- unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+ unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
uint32_t BEAlign = 0;
- if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+ !Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
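Note: the big-endian adjustment is simple arithmetic: a small argument occupies the high-address end of its 8-byte slot, so the fixed object is shifted by the slack. A hedged helper showing the values involved (hypothetical function, not in the source):

    // On big-endian AArch64 the low-order bytes of a small stack
    // argument sit at the high end of the 8-byte slot.
    unsigned beAlign(unsigned ArgSize) {
      return ArgSize < 8 ? 8 - ArgSize : 0;   // i8 -> 7, i16 -> 6, i32 -> 4
    }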
@@ -1809,7 +2148,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
- MemVT, false, false, false, nullptr);
+ MemVT, false, false, false, 0);
InVals.push_back(ArgValue);
}
@@ -1941,8 +2280,8 @@ SDValue AArch64TargetLowering::LowerCallResult(
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
@@ -2011,6 +2350,21 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
+ // Externally-defined functions with weak linkage should not be
+ // tail-called on AArch64 when the OS does not support dynamic
+ // pre-emption of symbols, as the AAELF spec requires normal calls
+ // to undefined weak functions to be replaced with a NOP or jump to the
+ // next instruction. The behaviour of branch instructions in this
+ // situation (as used for tail calls) is implementation-defined, so we
+ // cannot rely on the linker replacing the tail call with a return.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ const Triple TT(getTargetMachine().getTargetTriple());
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+ return false;
+ }
+
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
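Note: a concrete case of the weak-linkage hazard above, as a hedged C example: the linker may rewrite a bl to an undefined weak symbol as a NOP, but there is no equally safe rewrite for a tail-call branch.

    // If maybe_hook stays undefined at link time, "bl maybe_hook" can be
    // replaced by a NOP and wrapper still returns normally; a tail call
    // "b maybe_hook" has no such guaranteed fixup under AAELF.
    extern void maybe_hook(void) __attribute__((weak));
    void wrapper(void) { maybe_hook(); }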
@@ -2028,8 +2382,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2041,13 +2395,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// results are returned in the same way as what the caller expects.
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+ *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+ *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
if (RVLocs1.size() != RVLocs2.size())
@@ -2072,8 +2426,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
@@ -2170,8 +2524,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
@@ -2316,9 +2670,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
- : VA.getLocVT().getSizeInBits();
+ : VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
- if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+ !Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
@@ -2350,8 +2705,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
- /*isVolatile = */ false,
- /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+ /*isVol = */ false,
+ /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -2440,7 +2795,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
if (IsThisReturn) {
@@ -2494,7 +2850,7 @@ bool AArch64TargetLowering::CanLowerReturn(
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
@@ -2508,8 +2864,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
@@ -2560,7 +2916,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
SDLoc DL(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
@@ -2575,6 +2932,25 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
+ if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
+ assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ "use of MO_CONSTPOOL only supported on small model");
+ SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+ SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
+ SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+ MachinePointerInfo::getConstantPool(),
+ /*isVolatile=*/ false,
+ /*isNonTemporal=*/ true,
+ /*isInvariant=*/ true, 8);
+ if (GN->getOffset() != 0)
+ return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
+ DAG.getConstant(GN->getOffset(), PtrVT));
+ return GlobalAddr;
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
@@ -2651,7 +3027,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2701,7 +3078,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2916,11 +3294,6 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
-
- // TBZ only operates on i64's, but the ext should be free.
- if (Test.getValueType() == MVT::i32)
- Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
}
@@ -2936,18 +3309,29 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
-
- // TBNZ only operates on i64's, but the ext should be free.
- if (Test.getValueType() == MVT::i32)
- Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+ } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+ // Don't combine AND since emitComparison converts the AND to an ANDS
+ // (a.k.a. TST) and the test in the test bit and branch instruction
+ // becomes redundant. This would also increase register pressure.
+ uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+ DAG.getConstant(Mask, MVT::i64), Dest);
}
}
+ if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+ LHS.getOpcode() != ISD::AND) {
+ // Don't combine AND since emitComparison converts the AND to an ANDS
+ // (a.k.a. TST) and the test in the test bit and branch instruction
+ // becomes redundant. This would also increase register pressure.
+ uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+ DAG.getConstant(Mask, MVT::i64), Dest);
+ }
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
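Note: the new SETLT/SETGT-against-constant cases above turn a sign test into a test-bit branch; a hedged illustration:

    void handle_negative(int x);

    // x < 0 is exactly "top bit set", so instead of cmp w0, #0 followed
    // by b.lt, the compare folds into a tbnz on bit 31.
    void dispatch(int x) {
      if (x < 0)             // tbnz w0, #31, <taken>
        handle_negative(x);
    }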
@@ -3062,6 +3446,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
return SDValue();
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
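Note: the sequence being guarded is only available with NEON; a sketch of the expected lowering for a scalar popcount:

    // Scalar CTPOP is lowered through the vector unit: move to an FP
    // register, count bits per byte, then sum the byte lanes.
    int popcount64(unsigned long long x) {
      return __builtin_popcountll(x);
      // fmov d0, x0 ; cnt v0.8b, v0.8b ; uaddlv h0, v0.8b ; fmov w0, s0
    }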
@@ -4013,8 +4400,10 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
return;
case 'J': {
uint64_t NVal = -C->getSExtValue();
- if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+ if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+ CVal = C->getSExtValue();
break;
+ }
return;
}
// The K and L constraints apply *only* to logical immediates, including
@@ -4138,10 +4527,30 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
- SmallVector<SDValue, 2> SourceVecs;
- SmallVector<unsigned, 2> MinElts;
- SmallVector<unsigned, 2> MaxElts;
+ struct ShuffleSourceInfo {
+ SDValue Vec;
+ unsigned MinElt;
+ unsigned MaxElt;
+
+ // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+ // be compatible with the shuffle we intend to construct. As a result
+ // ShuffleVec will be some sliding window into the original Vec.
+ SDValue ShuffleVec;
+
+ // Code should guarantee that element i in Vec starts at element "WindowBase
+ // + i * WindowScale in ShuffleVec".
+ int WindowBase;
+ int WindowScale;
+
+ bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ ShuffleSourceInfo(SDValue Vec)
+ : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+ WindowScale(1) {}
+ };
+ // First gather all vectors used as an immediate source for this BUILD_VECTOR
+ // node.
+ SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() == ISD::UNDEF)
@@ -4152,133 +4561,155 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
return SDValue();
}
- // Record this extraction against the appropriate vector if possible...
+ // Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
- bool FoundSource = false;
- for (unsigned j = 0; j < SourceVecs.size(); ++j) {
- if (SourceVecs[j] == SourceVec) {
- if (MinElts[j] > EltNo)
- MinElts[j] = EltNo;
- if (MaxElts[j] < EltNo)
- MaxElts[j] = EltNo;
- FoundSource = true;
- break;
- }
- }
+ auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+ if (Source == Sources.end())
+ Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
- // Or record a new source if not...
- if (!FoundSource) {
- SourceVecs.push_back(SourceVec);
- MinElts.push_back(EltNo);
- MaxElts.push_back(EltNo);
- }
+ // Update the minimum and maximum lane number seen.
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ Source->MinElt = std::min(Source->MinElt, EltNo);
+ Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
- // involved.
- if (SourceVecs.size() > 2)
+ // are involved.
+ if (Sources.size() > 2)
return SDValue();
- SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
- int VEXTOffsets[2] = { 0, 0 };
- int OffsetMultipliers[2] = { 1, 1 };
-
- // This loop extracts the usage patterns of the source vectors
- // and prepares appropriate SDValues for a shuffle if possible.
- for (unsigned i = 0; i < SourceVecs.size(); ++i) {
- unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
- SDValue CurSource = SourceVecs[i];
- if (SourceVecs[i].getValueType().getVectorElementType() !=
- VT.getVectorElementType()) {
- // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
- // Then bitcast it to the vector which holds asserted element type,
- // and record the multiplier of element width between SourceVecs and
- // Build_vector which is needed to extract the correct lanes later.
- EVT CastVT =
- EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- SourceVecs[i].getValueSizeInBits() /
- VT.getVectorElementType().getSizeInBits());
-
- CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
- OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
- NumSrcElts *= OffsetMultipliers[i];
- MaxElts[i] *= OffsetMultipliers[i];
- MinElts[i] *= OffsetMultipliers[i];
+ // Find out the smallest element size among result and two sources, and use
+ // it as element size to build the shuffle_vector.
+ EVT SmallestEltTy = VT.getVectorElementType();
+ for (auto &Source : Sources) {
+ EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+ if (SrcEltTy.bitsLT(SmallestEltTy)) {
+ SmallestEltTy = SrcEltTy;
}
+ }
+ unsigned ResMultiplier =
+ VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+ NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
- if (CurSource.getValueType() == VT) {
- // No VEXT necessary
- ShuffleSrcs[i] = CurSource;
- VEXTOffsets[i] = 0;
+ // If the source vector is too wide or too narrow, we may nevertheless be able
+ // to construct a compatible shuffle either by concatenating it with UNDEF or
+ // extracting a suitable range of elements.
+ for (auto &Src : Sources) {
+ EVT SrcVT = Src.ShuffleVec.getValueType();
+
+ if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
- } else if (NumSrcElts < NumElts) {
+
+ // This stage of the search produces a source with the same element type as
+ // the original, but with a total width matching the BUILD_VECTOR output.
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+ if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+ assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
      // We can pad out the smaller vector for free by concatenating it with
      // UNDEF, so do that and use the result as a shuffle source.
- ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
- DAG.getUNDEF(CurSource.getValueType()));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+ DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
- // Since only 64-bit and 128-bit vectors are legal on ARM and
- // we've eliminated the other cases...
- assert(NumSrcElts == 2 * NumElts &&
- "unexpected vector sizes in ReconstructShuffle");
+ assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
- if (MaxElts[i] - MinElts[i] >= NumElts) {
+ if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope with.
return SDValue();
}
- if (MinElts[i] >= NumElts) {
+ if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
- VEXTOffsets[i] = NumElts;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(NumElts));
- } else if (MaxElts[i] < NumElts) {
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, MVT::i64));
+ Src.WindowBase = -NumSrcElts;
+ } else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
- VEXTOffsets[i] = 0;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(0));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
} else {
// An actual VEXT is needed
- VEXTOffsets[i] = MinElts[i];
- SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(0));
- SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(NumElts));
- unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
- ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
- DAG.getConstant(Imm, MVT::i32));
+ SDValue VEXTSrc1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, MVT::i64));
+ unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+ Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+ VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
+ Src.WindowBase = -Src.MinElt;
}
}
- SmallVector<int, 8> Mask;
-
- for (unsigned i = 0; i < NumElts; ++i) {
+  // Another possible incompatibility arises from the vector element types. We
+ // can fix this by bitcasting the source vectors to the same type we intend
+ // for the shuffle.
+ for (auto &Src : Sources) {
+ EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+ if (SrcEltTy == SmallestEltTy)
+ continue;
+ assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+ Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ Src.WindowBase *= Src.WindowScale;
+ }
+
+ // Final sanity check before we try to actually produce a shuffle.
+ DEBUG(
+ for (auto Src : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+ );
+
+  // The stars all align; our next step is to produce the mask for the shuffle.
+ SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+ int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF) {
- Mask.push_back(-1);
+ if (Entry.getOpcode() == ISD::UNDEF)
continue;
- }
- SDValue ExtractVec = Entry.getOperand(0);
- int ExtractElt =
- cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
- if (ExtractVec == SourceVecs[0]) {
- Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
- } else {
- Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
- VEXTOffsets[1]);
- }
+ auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+ int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only the low std::min(SrcBits, DestBits) bits actually get
+    // defined in this segment.
+ EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+ int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+ VT.getVectorElementType().getSizeInBits());
+ int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+ // This source is expected to fill ResMultiplier lanes of the final shuffle,
+ // starting at the appropriate offset.
+ int *LaneMask = &Mask[i * ResMultiplier];
+
+ int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+ ExtractBase += NumElts * (Src - Sources.begin());
+ for (int j = 0; j < LanesDefined; ++j)
+ LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
- if (isShuffleMaskLegal(Mask, VT))
- return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
- &Mask[0]);
+ if (!isShuffleMaskLegal(Mask, ShuffleVT))
+ return SDValue();
- return SDValue();
+ SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ for (unsigned i = 0; i < Sources.size(); ++i)
+ ShuffleOps[i] = Sources[i].ShuffleVec;
+
+ SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], &Mask[0]);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
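A standalone sketch of the lane-mask arithmetic above, simplified to assume every BUILD_VECTOR operand defines the same number of lanes; the struct and function names are hypothetical, not LLVM API. Element EltNo of a source lands at lane EltNo * WindowScale + WindowBase, offset by the shuffle width for each preceding source:

#include <vector>

struct Window { int Base; int Scale; };   // per-source window
struct OpInfo { int SrcIdx; int EltNo; }; // per BUILD_VECTOR operand

static std::vector<int> buildShuffleMask(unsigned ShuffleLanes,
                                         unsigned ResMultiplier,
                                         int LanesDefined,
                                         const std::vector<OpInfo> &Ops,
                                         const std::vector<Window> &Win) {
  std::vector<int> Mask(ShuffleLanes, -1); // -1 marks an undef lane
  for (unsigned i = 0; i < Ops.size(); ++i) {
    int ExtractBase = Ops[i].EltNo * Win[Ops[i].SrcIdx].Scale +
                      Win[Ops[i].SrcIdx].Base +
                      int(ShuffleLanes) * Ops[i].SrcIdx;
    for (int j = 0; j < LanesDefined; ++j)
      Mask[i * ResMultiplier + j] = ExtractBase + j;
  }
  return Mask;
}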
// check if an EXT instruction can handle the shuffle mask when the
@@ -4607,7 +5038,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
- if (VT.getVectorElementType() == MVT::i16)
+ if (VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::f16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -4727,7 +5159,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
- if (EltType == MVT::i16)
+ if (EltType == MVT::i16 || EltType == MVT::f16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
@@ -4857,7 +5289,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
- if (ScalarVT.getSizeInBits() < 32)
+
+ if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
@@ -4945,7 +5378,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -4954,7 +5387,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -4963,7 +5396,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -4972,7 +5405,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -4981,7 +5414,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -4990,7 +5423,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5145,7 +5578,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5154,7 +5587,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5163,7 +5596,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5172,7 +5605,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5181,7 +5614,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5190,7 +5623,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5263,13 +5696,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getSizeInBits() == 128) {
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Support the V64 version via subregister insertion.
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
@@ -5278,7 +5711,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5287,7 +5720,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5296,7 +5729,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5305,7 +5738,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5314,7 +5747,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5323,7 +5756,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5332,7 +5765,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(264, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5341,7 +5774,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(272, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
@@ -5349,7 +5782,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The few faces of FMOV...
@@ -5358,7 +5791,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
@@ -5366,7 +5799,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The many faces of MVNI...
@@ -5377,7 +5810,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5386,7 +5819,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5395,7 +5828,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5404,7 +5837,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5413,7 +5846,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5422,7 +5855,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5431,7 +5864,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(264, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5440,7 +5873,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(272, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5616,11 +6049,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+ VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
@@ -5649,11 +6083,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+ VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
@@ -6187,7 +6622,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::NoImplicitFloat) &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+ (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
return MVT::f128;
return Size >= 8 ? MVT::i64 : MVT::i32;
@@ -6382,6 +6817,48 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
return performIntegerAbsCombine(N, DAG);
}
+SDValue
+AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const {
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ if ((VT != MVT::i32 && VT != MVT::i64) ||
+ !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ unsigned Lg2 = Divisor.countTrailingZeros();
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);
+
+ // Add (N0 < 0) ? Pow2 - 1 : 0;
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
+
+ if (Created) {
+ Created->push_back(Cmp.getNode());
+ Created->push_back(Add.getNode());
+ Created->push_back(CSel.getNode());
+ }
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ if (Created)
+ Created->push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
+}
+
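A scalar sketch of the sequence BuildSDIVPow2 emits (hypothetical helper, for illustration only): C-style signed division truncates toward zero, which the conditional add of Pow2 - 1 (the CSEL) followed by an arithmetic shift reproduces, with a final negation for negative divisors.

#include <cstdint>

static int64_t sdivPow2(int64_t X, unsigned Lg2, bool NegativeDivisor) {
  int64_t Bias = (int64_t(1) << Lg2) - 1; // Pow2MinusOne
  int64_t Sel = X < 0 ? X + Bias : X;     // CSEL between Add and N0
  int64_t Quot = Sel >> Lg2;              // SRA
  return NegativeDivisor ? -Quot : Quot;  // SUB 0, Quot when divisor < 0
}

For example, sdivPow2(-7, 1, false) == -3, matching -7 / 2 in C.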
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -6459,14 +6936,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
- // Now check that the other operand of the AND is a constant splat. We could
+ // Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
- // Bail out if the vector isn't a constant splat.
- if (!BV->getConstantSplatNode())
+ // Bail out if the vector isn't a constant.
+ if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
@@ -6486,7 +6963,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -6505,7 +6983,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
- if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -7266,11 +7744,11 @@ static SDValue performExtendCombine(SDNode *N,
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
- if (!ResVT.isSimple())
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src->getValueType(0);
+ if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
- SDValue Src = N->getOperand(0);
- MVT SrcVT = Src->getValueType(0).getSimpleVT();
// If the source VT is a 64-bit vector, we can play games and get the
// better results we want.
if (SrcVT.getSizeInBits() != 64)
@@ -7294,9 +7772,9 @@ static SDValue performExtendCombine(SDNode *N,
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
@@ -7418,9 +7896,9 @@ static SDValue performSTORECombine(SDNode *N,
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(NumElts));
+ DAG.getConstant(NumElts, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
@@ -7504,7 +7982,7 @@ static SDValue performPostLD1Combine(SDNode *N,
Ops.push_back(Inc);
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
- SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+ SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
MemVT,
@@ -7634,7 +8112,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
- SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7655,10 +8133,272 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
return SDValue();
}
+// Checks whether the value has the prescribed width and returns information
+// about its extension mode.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+ ExtType = ISD::NON_EXTLOAD;
+ switch(V.getNode()->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD: {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+ if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+ || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+ ExtType = LoadNode->getExtensionType();
+ return true;
+ }
+ return false;
+ }
+ case ISD::AssertSext: {
+ VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+ if ((TypeNode->getVT() == MVT::i8 && width == 8)
+ || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ ExtType = ISD::SEXTLOAD;
+ return true;
+ }
+ return false;
+ }
+ case ISD::AssertZext: {
+ VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+ if ((TypeNode->getVT() == MVT::i8 && width == 8)
+ || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ ExtType = ISD::ZEXTLOAD;
+ return true;
+ }
+ return false;
+ }
+ case ISD::Constant:
+ case ISD::TargetConstant: {
+ if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+ 1LL << (width - 1))
+ return true;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | Input | | AddConstant | | CompConstant| | CC |
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | | | |
+// V V | +----------+
+// +-------------+ +----+ | |
+// | ADD | |0xff| | |
+// +-------------+ +----+ | |
+// | | | |
+// V V | |
+// +-------------+ | |
+// | AND | | |
+// +-------------+ | |
+// | | |
+// +-----+ | |
+// | | |
+// V V V
+// +-------------+
+// | CMP |
+// +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for inputs of any width; the above graph
+// is specific to 8 bits).
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstant bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true
+// for all 16 distinct bit patterns of the current extension type of Input (w0).
+//
+// sub w8, w0, w1
+// and w10, w8, #0x0f
+// cmp w8, w2
+// cset w9, AArch64CC
+// cmp w10, w2
+// cset w11, AArch64CC
+// cmp w9, w11
+// cset w0, eq
+// ret
+//
+// Since the above function shows when the outputs are equivalent, it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they give outputs equivalent to the above
+// function for all inputs, so they can be used to determine whether the
+// removal is legal instead.
+//
+// isEquivalentMaskless() is the test for whether the AND can be removed,
+// factored out of the DAG recognition since the DAG can take several forms.
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, signed AddConstant,
+ signed CompConstant) {
+  // By being careful about our equations and only writing them in terms of
+  // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
+  // make them generally applicable to all bit widths.
+ signed MaxUInt = (1 << width);
+
+ // For the purposes of these comparisons sign extending the type is
+ // equivalent to zero extending the add and displacing it by half the integer
+ // width. Provided we are careful and make sure our equations are valid over
+ // the whole range we can just adjust the input and avoid writing equations
+ // for sign extended inputs.
+ if (ExtType == ISD::SEXTLOAD)
+ AddConstant -= (1 << (width-1));
+
+ switch(CC) {
+ case AArch64CC::LE:
+ case AArch64CC::GT: {
+ if ((AddConstant == 0) ||
+ (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+ (AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LT:
+ case AArch64CC::GE: {
+ if ((AddConstant == 0) ||
+ (AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::HI:
+ case AArch64CC::LS: {
+ if ((AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant >= -1 &&
+ CompConstant < AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::PL:
+ case AArch64CC::MI: {
+ if ((AddConstant == 0) ||
+ (AddConstant > 0 && CompConstant <= 0) ||
+ (AddConstant < 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LO:
+ case AArch64CC::HS: {
+ if ((AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant >= 0 &&
+ CompConstant <= AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::EQ:
+ case AArch64CC::NE: {
+ if ((AddConstant > 0 && CompConstant < 0) ||
+ (AddConstant < 0 && CompConstant >= 0 &&
+ CompConstant < AddConstant + MaxUInt) ||
+ (AddConstant >= 0 && CompConstant >= 0 &&
+ CompConstant >= AddConstant) ||
+ (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+ return true;
+ } break;
+ case AArch64CC::VS:
+ case AArch64CC::VC:
+ case AArch64CC::AL:
+ case AArch64CC::NV:
+ return true;
+ case AArch64CC::Invalid:
+ break;
+ }
+
+ return false;
+}
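A compressed sketch of the 4-bit harness the comment above describes (hypothetical code; the real harness ran the AArch64 assembly shown earlier): for one condition, one extension type, and a given AddConstant/CompConstant pair, the AND is removable only if the masked and unmasked comparisons agree for all 16 input patterns.

#include <functional>

// Pred models one AArch64CC comparison applied to full-width values.
static bool agreesForAllInputs(const std::function<bool(int, int)> &Pred,
                               bool SignExtended, int AddConstant,
                               int CompConstant) {
  for (int i = 0; i < 16; ++i) {
    int W0 = SignExtended ? i - 8 : i; // the 16 4-bit input patterns
    bool Unmasked = Pred(W0 + AddConstant, CompConstant);
    bool Masked = Pred((W0 + AddConstant) & 0xf, CompConstant);
    if (Unmasked != Masked)
      return false;
  }
  return true;
}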
+
+static
+SDValue performCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG, unsigned CCIndex,
+ unsigned CmpIndex) {
+ unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+ SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+ unsigned CondOpcode = SubsNode->getOpcode();
+
+ if (CondOpcode != AArch64ISD::SUBS)
+ return SDValue();
+
+ // There is a SUBS feeding this condition. Is it fed by a mask we can
+ // use?
+
+ SDNode *AndNode = SubsNode->getOperand(0).getNode();
+ unsigned MaskBits = 0;
+
+ if (AndNode->getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+ uint32_t CNV = CN->getZExtValue();
+ if (CNV == 255)
+ MaskBits = 8;
+ else if (CNV == 65535)
+ MaskBits = 16;
+ }
+
+ if (!MaskBits)
+ return SDValue();
+
+ SDValue AddValue = AndNode->getOperand(0);
+
+ if (AddValue.getOpcode() != ISD::ADD)
+ return SDValue();
+
+  // The basic DAG structure is correct; grab the inputs and validate them.
+
+ SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+ SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+ SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and all of the values originate from a narrower type,
+  // so let's see whether the mask is superfluous.
+
+ if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+ !isa<ConstantSDNode>(SubsInputValue.getNode()))
+ return SDValue();
+
+ ISD::LoadExtType ExtType;
+
+ if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+ !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+ !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+ return SDValue();
+
+ if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+ cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+ cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+ return SDValue();
+
+  // The AND is not necessary; remove it.
+
+ SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+ SubsNode->getValueType(1));
+ SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+ SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+ DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+ return SDValue(N, 0);
+}
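For illustration (hypothetical constants), with w8 fed by an i8 zero-extending load,

  (CSEL t, f, cc, (SUBS (AND (ADD w8, 1), 255), 3))

loses its mask and becomes

  (CSEL t, f, cc, (SUBS (ADD w8, 1), 3))

whenever isEquivalentMaskless(cc, 8, ZEXTLOAD, 1, 3) holds.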
+
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+ if (NV.getNode())
+ N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue CCVal = N->getOperand(2);
@@ -7747,21 +8487,29 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
- if (!N->getOperand(1).getValueType().isVector())
+ if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
return SDValue();
- if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+  // If NumMaskElts == 0, the compared type is wider than the select result. The
+  // largest real NEON comparison is 64 bits per lane, which means the result is
+  // at most 32 bits and an illegal vector. Just bail out for now.
+ EVT SrcVT = N0.getOperand(0).getValueType();
+
+ // Don't try to do this optimization when the setcc itself has i1 operands.
+ // There are no legal vectors of i1, so this would be pointless.
+ if (SrcVT == MVT::i1)
return SDValue();
- SDLoc DL(N0);
+ int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+ if (!ResVT.isVector() || NumMaskElts == 0)
+ return SDValue();
- EVT SrcVT = N0.getOperand(0).getValueType();
- SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
- ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
+ SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
@@ -7771,8 +8519,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
- Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
- Mask);
+ Mask = DAG.getNode(ISD::BITCAST, DL,
+ ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
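A minimal sketch of the semantics this combine rewrites (assumed four lanes): the scalar setcc is evaluated once, splatted across a mask vector, and then drives a lane-wise select.

#include <array>
#include <cstddef>

template <typename T, std::size_t N>
static std::array<T, N> selectWholeVectors(bool Cmp, const std::array<T, N> &X,
                                           const std::array<T, N> &Y) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i < N; ++i)
    R[i] = Cmp ? X[i] : Y[i]; // Cmp stands in for the splatted mask lane
  return R;
}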
@@ -7792,7 +8540,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- return performIntToFpCombine(N, DAG);
+ return performIntToFpCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
@@ -7813,6 +8561,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
+ case AArch64ISD::CSEL:
+ return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case ISD::INSERT_VECTOR_ELT:
@@ -7968,13 +8718,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
- if (N->getValueType(0) != MVT::i16)
- return;
-
SDLoc DL(N);
SDValue Op = N->getOperand(0);
- assert(Op.getValueType() == MVT::f16 &&
- "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+
+ if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+ return;
+
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
@@ -8000,17 +8749,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
}
}
-bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
- // Loads and stores less than 128-bits are already atomic; ones above that
- // are doomed anyway, so defer to the default libcall and blame the OS when
- // things go wrong:
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
- return LI->getType()->getPrimitiveSizeInBits() == 128;
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+ return true;
+}
- // For the real atomic operations, we have ldxr/stxr up to 128 bits.
- return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+ // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+ // reciprocal if there are three or more FDIVs.
+ return NumUsers > 2;
}
TargetLoweringBase::LegalizeTypeAction
@@ -8025,12 +8771,37 @@ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+// Loads and stores less than 128 bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+ return Size == 128;
+}
+
+// Loads and stores less than 128 bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+ return Size == 128;
+}
+
+// For the real atomic operations, we have ldxr/stxr up to 128 bits.
+bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+ return Size <= 128;
+}
+
+bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+ return true;
+}
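Taken together, the three hooks above implement the policy sketched below (hypothetical free function, not an LLVM interface):

enum class AtomicKind { Load, Store, RMW };

// Whether an atomic operation of the given width is expanded to an
// ldxr/stxr loop in IR: RMW operations use exclusives up to 128 bits,
// while plain loads and stores below 128 bits are already atomic.
static bool expandsToLLSC(AtomicKind K, unsigned Bits) {
  if (K == AtomicKind::RMW)
    return Bits <= 128;
  return Bits == 128;
}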
+
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire =
- Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+ bool IsAcquire = isAtLeastAcquire(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
@@ -8065,8 +8836,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease =
- Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+ bool IsRelease = isAtLeastRelease(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
@@ -8093,3 +8863,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Val, Stxr->getFunctionType()->getParamType(0)),
Addr);
}
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+ return Ty->isArrayTy();
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9ef..cc25bed 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
-#define LLVM_TARGET_AArch64_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -162,6 +162,16 @@ enum {
SITOF,
UITOF,
+  /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
+  /// world w.r.t. vectors, which causes additional REV instructions to be
+  /// generated to compensate for the byte-swapping. But sometimes we do
+  /// need to re-interpret the data in SIMD vector registers in big-endian
+  /// mode without emitting such REV instructions.
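+  /// (On little-endian subtargets NVCAST should behave exactly like a
+  /// BITCAST and is expected to fold to a plain register reinterpretation;
+  /// the distinction only matters for big-endian lowering.)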
+ NVCAST,
+
+ SMULL,
+ UMULL,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -197,10 +207,9 @@ class AArch64TargetLowering : public TargetLowering {
bool RequireStrictAlign;
public:
- explicit AArch64TargetLowering(TargetMachine &TM);
+ explicit AArch64TargetLowering(const TargetMachine &TM);
- /// Selects the correct CCAssignFn for a the given CallingConvention
- /// value.
+ /// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -212,10 +221,11 @@ public:
MVT getScalarShiftAmountTy(EVT LHSTy) const override;
- /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// allowsMisalignedMemoryAccesses - Returns true if the target allows
  /// unaligned memory accesses of the specified type.
- bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
- bool *Fast = nullptr) const override {
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+ unsigned Align = 1,
+ bool *Fast = nullptr) const override {
if (RequireStrictAlign)
return false;
// FIXME: True for Cyclone, but not necessary others.
@@ -317,13 +327,17 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool hasLoadLinkedStoreConditional() const override;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
- bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+ bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool useLoadStackGuardNode() const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -424,6 +438,10 @@ private:
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const override;
+ bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+
ConstraintType
getConstraintType(const std::string &Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT) const override;
@@ -455,6 +473,10 @@ private:
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+
+ bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+ CallingConv::ID CallConv,
+ bool isVarArg) const override;
};
namespace AArch64 {
@@ -464,4 +486,4 @@ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
} // end namespace llvm
-#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
+#endif
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 3b9e3c6..4923a11 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -29,8 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- assert(Ordering != AcquireRelease && "unexpected load ordering");
- return Ordering == Acquire || Ordering == SequentiallyConsistent;
+ return isAtLeastAcquire(Ordering);
}]>;
// An atomic load operation that does not need either acquire or release
@@ -38,7 +37,7 @@ class acquiring_load<PatFrag base>
class relaxed_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Monotonic || Ordering == Unordered;
+ return !isAtLeastAcquire(Ordering);
}]>;
// 8-bit loads
@@ -114,14 +113,14 @@ class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
assert(Ordering != AcquireRelease && "unexpected store ordering");
- return Ordering == Release || Ordering == SequentiallyConsistent;
+ return isAtLeastRelease(Ordering);
}]>;
// An atomic store operation that doesn't actually need to be atomic on AArch64.
class relaxed_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Monotonic || Ordering == Unordered;
+ return !isAtLeastRelease(Ordering);
}]>;
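For reference, a sketch of the ordering predicates these PatFrags now rely on; the enumerator values and bodies below mirror my reading of the 3.6-era headers and should be treated as an approximation, not the verbatim LLVM definitions:

enum AtomicOrdering {
  NotAtomic = 0, Unordered = 1, Monotonic = 2, /* 3 unused */
  Acquire = 4, Release = 5, AcquireRelease = 6, SequentiallyConsistent = 7
};

// Orderings at least as strong as acquire / release, respectively.
static bool isAtLeastAcquire(AtomicOrdering Ord) {
  return Ord == Acquire || Ord == AcquireRelease ||
         Ord == SequentiallyConsistent;
}
static bool isAtLeastRelease(AtomicOrdering Ord) {
  return Ord == Release || Ord == AcquireRelease ||
         Ord == SequentiallyConsistent;
}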
// 8-bit stores
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index e88c0c0..d295c02 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -843,7 +843,7 @@ def MRSSystemRegisterOperand : AsmOperandClass {
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MRS";
}
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
def mrs_sysreg_op : Operand<i32> {
let ParserMatchClass = MRSSystemRegisterOperand;
let DecoderMethod = "DecodeMRSSystemRegister";
@@ -863,9 +863,8 @@ def msr_sysreg_op : Operand<i32> {
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
- bits<15> systemreg;
- let Inst{20} = 1;
- let Inst{19-5} = systemreg;
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg;
}
// FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -873,9 +872,8 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
// would do it, but feels like overkill at this point.
class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
"msr", "\t$systemreg, $Rt"> {
- bits<15> systemreg;
- let Inst{20} = 1;
- let Inst{19-5} = systemreg;
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg;
}
def SystemPStateFieldOperand : AsmOperandClass {
@@ -1351,14 +1349,15 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
}
multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+ // MADD/MSUB generation is decided by MachineCombiner.cpp
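+  // (Leaving the patterns below disabled keeps mul and add/sub as separate
+  // nodes until the MachineCombiner runs, so MADD/MSUB formation can be
+  // weighed against critical-path length per subtarget.)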
def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
- [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+ [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
}
def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
- [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+ [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
@@ -1636,7 +1635,7 @@ class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
multiclass AddSub<bit isSub, string mnemonic,
SDPatternOperator OpNode = null_frag> {
- let hasSideEffects = 0 in {
+ let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// Add/Subtract immediate
def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
@@ -1961,14 +1960,14 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
- let AddedComplexity = 6 in
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
[(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
}
- let AddedComplexity = 6 in
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
[(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
logical_imm64:$imm))]> {
@@ -2013,8 +2012,10 @@ class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
// Split from LogicalImm as not all instructions have both.
multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode> {
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+ }
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn,
@@ -2995,7 +2996,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm,
- "$Rn = $wback", []>,
+ "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
@@ -3004,7 +3005,7 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback",
+ asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
@@ -3014,7 +3015,6 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
// Load/store post-indexed
//---
-// (pre-index) load/stores.
class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
@@ -3042,7 +3042,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback", []>,
+ asm, "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteI]>;
let mayStore = 1, mayLoad = 0 in
@@ -3051,7 +3051,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback",
+ asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST, ReadAdrBase]>;
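The @earlyclobber marker added to these writeback formats keeps the register allocator from assigning $wback to any other operand of the instruction; a pre/post-indexed load or store whose transfer register equals the base register (for example "ldr x0, [x0], #8") is unpredictable in the architecture, so the overlap has to be ruled out at allocation time.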
@@ -3115,7 +3115,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
// (pre-indexed)
class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
- : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
@@ -3156,7 +3156,7 @@ class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
- : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
@@ -4383,7 +4383,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
}
multiclass SIMDVectorLShiftLongBySizeBHS {
- let neverHasSideEffects = 1 in {
+ let hasSideEffects = 0 in {
def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
"shll", ".8h", ".8b", "8">;
def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
@@ -5260,6 +5260,10 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
asm, ".2d", OpNode, v2i64>;
+ def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+ (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index ce85b2c..e582ed4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineCombinerPattern.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -260,8 +261,9 @@ void AArch64InstrInfo::instantiateCondBranch(
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
} else {
// Folded compare-and-branch
+ // Note that we use addOperand instead of addReg to keep the flags.
const MachineInstrBuilder MIB =
- BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
@@ -541,6 +543,51 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC);
}
+// FIXME: this implementation should be micro-architecture dependent, so a
+// micro-architecture target hook should be introduced here in future.
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
+ if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
+ return MI->isAsCheapAsAMove();
+
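+  // A sketch of the rationale (inferred from the FIXME above, not spelled
+  // out in the patch): on Cortex-A53/A57 the ALU operations below are
+  // assumed to be essentially as cheap as a register move, so rematerializing
+  // them can beat spilling and reloading their result.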
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+
+  // add/sub with an immediate and no shift applied to it
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ return (MI->getOperand(3).getImm() == 0);
+
+ // logical ops on immediate
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ case AArch64::EORWri:
+ case AArch64::EORXri:
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ return true;
+
+ // logical ops on register without shift
+ case AArch64::ANDWrr:
+ case AArch64::ANDXrr:
+ case AArch64::BICWrr:
+ case AArch64::BICXrr:
+ case AArch64::EONWrr:
+ case AArch64::EONXrr:
+ case AArch64::EORWrr:
+ case AArch64::EORXrr:
+ case AArch64::ORNWrr:
+ case AArch64::ORNXrr:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXrr:
+ return true;
+ }
+
+ llvm_unreachable("Unknown opcode to check as cheap as a move!");
+}
+
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
@@ -561,6 +608,42 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
}
+bool
+AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+ MachineInstr *MIb,
+ AliasAnalysis *AA) const {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned BaseRegA = 0, BaseRegB = 0;
+ int OffsetA = 0, OffsetB = 0;
+ int WidthA = 0, WidthB = 0;
+
+ assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+ "MIa must be a store or a load");
+ assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+ "MIb must be a store or a load");
+
+ if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+ MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ return false;
+
+ // Retrieve the base register, offset from the base register and width. Width
+ // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
+ // base registers are identical, and the offset of a lower memory access +
+ // the width doesn't overlap the offset of a higher memory access,
+ // then the memory accesses are different.
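+  // For example (illustrative values, not from the patch): accesses at
+  // (Offset=0, Width=8) and (Offset=8, Width=4) off the same base satisfy
+  // 0 + 8 <= 8 and cannot overlap, while (Offset=0, Width=16) against
+  // (Offset=8, Width=8) fails the check because 0 + 16 > 8.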
+ if (getLdStBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+ getLdStBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+ if (BaseRegA == BaseRegB) {
+ int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+ int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+ int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+ if (LowOffset + LowWidth <= HighOffset)
+ return true;
+ }
+ }
+ return false;
+}
+
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
@@ -595,7 +678,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(2).getImm();
+  // FIXME: CmpValue is only ever compared against zero in
+  // optimizeCompareInstr, so normalize it to 0 or 1 here.
+ CmpValue = (MI->getOperand(2).getImm() != 0);
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
@@ -604,9 +688,14 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = AArch64_AM::decodeLogicalImmediate(
- MI->getOperand(2).getImm(),
- MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+      // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
+      // while the type of CmpValue is int, so converting the decoded value
+      // directly would lose the high 32 bits; this caused a miscompile of
+      // spec2006-483.xalancbmk. CmpValue is only used to compare with zero
+      // in optimizeCompareInstr, so normalize it to 0 or 1 here as well.
+ CmpValue = (AArch64_AM::decodeLogicalImmediate(
+ MI->getOperand(2).getImm(),
+ MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0);
return true;
}
@@ -619,8 +708,8 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
const TargetMachine *TM = &MF->getTarget();
- const TargetInstrInfo *TII = TM->getInstrInfo();
- const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+ const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
@@ -652,6 +741,87 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
return true;
}
+/// \brief Return the opcode that does not set flags when possible - otherwise
+/// return the original opcode. The caller is responsible for the actual
+/// substitution and legality checking.
+static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
+ // Don't convert all compare instructions, because for some the zero register
+ // encoding becomes the sp register.
+ bool MIDefinesZeroReg = false;
+ if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR))
+ MIDefinesZeroReg = true;
+
+ switch (MI->getOpcode()) {
+ default:
+ return MI->getOpcode();
+ case AArch64::ADDSWrr:
+ return AArch64::ADDWrr;
+ case AArch64::ADDSWri:
+ return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
+ case AArch64::ADDSWrs:
+ return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
+ case AArch64::ADDSWrx:
+ return AArch64::ADDWrx;
+ case AArch64::ADDSXrr:
+ return AArch64::ADDXrr;
+ case AArch64::ADDSXri:
+ return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
+ case AArch64::ADDSXrs:
+ return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
+ case AArch64::ADDSXrx:
+ return AArch64::ADDXrx;
+ case AArch64::SUBSWrr:
+ return AArch64::SUBWrr;
+ case AArch64::SUBSWri:
+ return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
+ case AArch64::SUBSWrs:
+ return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
+ case AArch64::SUBSWrx:
+ return AArch64::SUBWrx;
+ case AArch64::SUBSXrr:
+ return AArch64::SUBXrr;
+ case AArch64::SUBSXri:
+ return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
+ case AArch64::SUBSXrs:
+ return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
+ case AArch64::SUBSXrx:
+ return AArch64::SUBXrx;
+ }
+}
+
+/// True when the condition code could be modified on the instruction
+/// trace starting at \p From and ending at \p To.
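+/// When \p CheckOnlyCCWrites is true, instructions that merely read NZCV on
+/// the trace are tolerated and only writes of NZCV block the transformation;
+/// otherwise any read or write of NZCV does.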
+static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To,
+ const bool CheckOnlyCCWrites,
+ const TargetRegisterInfo *TRI) {
+  // We iterate backward starting at \p To until we hit \p From.
+ MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin();
+
+ // Early exit if To is at the beginning of the BB.
+ if (I == B)
+ return true;
+
+ // Check whether the definition of SrcReg is in the same basic block as
+ // Compare. If not, assume the condition code gets modified on some path.
+ if (To->getParent() != From->getParent())
+ return true;
+
+ // Check that NZCV isn't set on the trace.
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+ (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI)))
+ // This instruction modifies or uses NZCV after the one we want to
+ // change.
+ return true;
+ if (I == B)
+ // We currently don't allow the instruction trace to cross basic
+ // block boundaries
+ return true;
+ }
+ return false;
+}
/// optimizeCompareInstr - Convert the instruction supplying the argument to the
/// comparison into one that sets the zero bit in the flags register.
bool AArch64InstrInfo::optimizeCompareInstr(
@@ -661,28 +831,15 @@ bool AArch64InstrInfo::optimizeCompareInstr(
// Replace SUBSWrr with SUBWrr if NZCV is not used.
int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
if (Cmp_NZCV != -1) {
- unsigned NewOpc;
- switch (CmpInstr->getOpcode()) {
- default:
- return false;
- case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break;
- case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break;
- case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break;
- case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break;
- case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break;
- case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break;
- case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break;
- case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break;
- case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break;
- case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break;
- case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break;
- case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break;
- case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break;
- case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break;
- case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break;
- case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break;
+ if (CmpInstr->definesRegister(AArch64::WZR) ||
+ CmpInstr->definesRegister(AArch64::XZR)) {
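+      // The flag-setting op writes only the zero register here, so with a
+      // dead NZCV def it has no observable effect and can simply be erased
+      // (e.g. "cmp w0, #0" is "subs wzr, w0, #0").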
+ CmpInstr->eraseFromParent();
+ return true;
}
-
+ unsigned Opc = CmpInstr->getOpcode();
+ unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ if (NewOpc == Opc)
+ return false;
const MCInstrDesc &MCID = get(NewOpc);
CmpInstr->setDesc(MCID);
CmpInstr->RemoveOperand(Cmp_NZCV);
@@ -693,6 +850,9 @@ bool AArch64InstrInfo::optimizeCompareInstr(
}
// Continue only if we have a "ri" where immediate is zero.
+  // FIXME: CmpValue has already been converted to 0 or 1 in
+  // analyzeCompare.
+ assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (CmpValue != 0 || SrcReg2 != 0)
return false;
@@ -705,36 +865,10 @@ bool AArch64InstrInfo::optimizeCompareInstr(
if (!MI)
return false;
- // We iterate backward, starting from the instruction before CmpInstr and
- // stop when reaching the definition of the source register or done with the
- // basic block, to check whether NZCV is used or modified in between.
- MachineBasicBlock::iterator I = CmpInstr, E = MI,
- B = CmpInstr->getParent()->begin();
-
- // Early exit if CmpInstr is at the beginning of the BB.
- if (I == B)
- return false;
-
- // Check whether the definition of SrcReg is in the same basic block as
- // Compare. If not, we can't optimize away the Compare.
- if (MI->getParent() != CmpInstr->getParent())
- return false;
-
- // Check that NZCV isn't set between the comparison instruction and the one we
- // want to change.
+ bool CheckOnlyCCWrites = false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
- for (--I; I != E; --I) {
- const MachineInstr &Instr = *I;
-
- if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
- Instr.readsRegister(AArch64::NZCV, TRI))
- // This instruction modifies or uses NZCV after the one we want to
- // change. We can't do this transformation.
- return false;
- if (I == B)
- // The 'and' is below the comparison instruction.
- return false;
- }
+ if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI))
+ return false;
unsigned NewOpc = MI->getOpcode();
switch (MI->getOpcode()) {
@@ -848,6 +982,56 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
+bool
+AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Reg = MI->getOperand(0).getReg();
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ const TargetMachine &TM = MBB.getParent()->getTarget();
+ unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+ const unsigned char MO_NC = AArch64II::MO_NC;
+
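+  // The guard pointer is materialized in one of three ways (a sketch; the
+  // guard global is typically __stack_chk_guard, though that name is not
+  // part of this code):
+  //   GOT:         ldr Reg, :got:guard            ; ldr Reg, [Reg]
+  //   large model: movz/movk Reg, #:abs_gN:guard (x4) ; ldr Reg, [Reg]
+  //   otherwise:   adrp Reg, guard                ; ldr Reg, [Reg, :lo12:guard]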
+ if ((OpFlags & AArch64II::MO_GOT) != 0) {
+ BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill).addImm(0)
+ .addMemOperand(*MI->memoperands_begin());
+ } else if (TM.getCodeModel() == CodeModel::Large) {
+ BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill).addImm(0)
+ .addMemOperand(*MI->memoperands_begin());
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+ .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI->memoperands_begin());
+ }
+
+ MBB.erase(MI);
+
+ return true;
+}
+
/// Return true if this instruction has a non-zero immediate
bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
switch (MI->getOpcode()) {
@@ -963,12 +1147,14 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
+ break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
if (MI->getOperand(2).getImm() == 0) {
assert(MI->getDesc().getNumOperands() == 4 &&
MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
+ break;
}
return false;
}
@@ -991,6 +1177,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
"invalid ORRv16i8 operands");
return true;
}
+ break;
}
return false;
}
@@ -1152,6 +1339,102 @@ AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
};
}
+bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth(
+ MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width,
+ const TargetRegisterInfo *TRI) const {
+ // Handle only loads/stores with base register followed by immediate offset.
+ if (LdSt->getNumOperands() != 3)
+ return false;
+ if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ return false;
+
+  // Offset is calculated as the immediate operand multiplied by the
+  // scaling factor. Unscaled instructions have a scaling factor of 1.
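+  // E.g. "ldr x0, [x1, #16]" (LDRXui) encodes immediate 2 with
+  // Scale = Width = 8, so Offset = 2 * 8 = 16 bytes, while
+  // "ldur x0, [x1, #2]" (LDURXi) has Scale = 1 and Offset = 2.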
+ int Scale = 0;
+ switch (LdSt->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ Width = 16;
+ Scale = 1;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ case AArch64::STURXi:
+ case AArch64::STURDi:
+ Width = 8;
+ Scale = 1;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::STURSi:
+ Width = 4;
+ Scale = 1;
+ break;
+ case AArch64::LDURHi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURHi:
+ case AArch64::STURHHi:
+ Width = 2;
+ Scale = 1;
+ break;
+ case AArch64::LDURBi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ Width = 1;
+ Scale = 1;
+ break;
+ case AArch64::LDRXui:
+ case AArch64::STRXui:
+ Scale = Width = 8;
+ break;
+ case AArch64::LDRWui:
+ case AArch64::STRWui:
+ Scale = Width = 4;
+ break;
+ case AArch64::LDRBui:
+ case AArch64::STRBui:
+ Scale = Width = 1;
+ break;
+ case AArch64::LDRHui:
+ case AArch64::STRHui:
+ Scale = Width = 2;
+ break;
+ case AArch64::LDRSui:
+ case AArch64::STRSui:
+ Scale = Width = 4;
+ break;
+ case AArch64::LDRDui:
+ case AArch64::STRDui:
+ Scale = Width = 8;
+ break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = Width = 16;
+ break;
+ case AArch64::LDRBBui:
+ case AArch64::STRBBui:
+ Scale = Width = 1;
+ break;
+ case AArch64::LDRHHui:
+ case AArch64::STRHHui:
+ Scale = Width = 2;
+ break;
+  }
+
+ BaseReg = LdSt->getOperand(1).getReg();
+ Offset = LdSt->getOperand(2).getImm() * Scale;
+ return true;
+}
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
@@ -1194,16 +1477,15 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
}
}
-MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx,
- uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const {
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
+ MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
+ const MDNode *Expr, DebugLoc DL) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
.addFrameIndex(FrameIx)
.addImm(0)
.addImm(Offset)
- .addMetadata(MDPtr);
+ .addMetadata(Var)
+ .addMetadata(Expr);
return &*MIB;
}
@@ -2087,3 +2369,592 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::CreateImm(0));
}
+/// useMachineCombiner - return true when a target supports MachineCombiner
+bool AArch64InstrInfo::useMachineCombiner() const {
+ // AArch64 supports the combiner
+ return true;
+}
+//
+// True when Opc sets flag
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// 32b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate32(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDWrr:
+ case AArch64::ADDWri:
+ case AArch64::SUBWrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWri:
+ case AArch64::SUBSWrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBWri:
+ case AArch64::SUBSWri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// 64b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate64(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDXrr:
+ case AArch64::ADDXri:
+ case AArch64::SUBXrr:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXri:
+ case AArch64::SUBSXrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBXri:
+ case AArch64::SUBSXri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate(unsigned Opc) {
+ return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
+}
+
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc, unsigned ZeroReg) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineInstr *MI = nullptr;
+ // We need a virtual register definition.
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ MI = MRI.getUniqueVRegDef(MO.getReg());
+ // And it needs to be in the trace (otherwise, it won't have a depth).
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
+ return false;
+
+ assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+         MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
+
+ // The third input reg must be zero.
+ if (MI->getOperand(3).getReg() != ZeroReg)
+ return false;
+
+  // Must only be used by the user we combine with.
+ if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ return false;
+
+ return true;
+}
+
+/// hasPattern - return true when there is potentially a faster code sequence
+/// for an instruction chain ending in \p Root. All potential patterns are
+/// listed in the \p Pattern vector. Patterns should be sorted in priority
+/// order since the pattern evaluator stops checking as soon as it finds a
+/// faster sequence.
+bool AArch64InstrInfo::hasPattern(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+ unsigned Opc = Root.getOpcode();
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ if (!isCombineInstrCandidate(Opc))
+    return false;
+ if (isCombineInstrSettingFlag(Opc)) {
+ int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
+ // When NZCV is live bail out.
+ if (Cmp_NZCV == -1)
+      return false;
+ unsigned NewOpc = convertFlagSettingOpcode(&Root);
+ // When opcode can't change bail out.
+ // CHECKME: do we miss any cases for opcode conversion?
+ if (NewOpc == Opc)
+      return false;
+ Opc = NewOpc;
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ case AArch64::ADDWrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "ADDWrr does not have register operands");
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDXrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBWrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBXrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDWri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDXri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBWri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBXri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1);
+ Found = true;
+ }
+ break;
+ }
+ return Found;
+}
+
+/// genMadd - Generate madd instruction and combine mul and add.
+/// Example:
+/// MUL I=A,B,0
+/// ADD R,I,C
+/// ==> MADD R,A,B,C
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the madd instruction
+static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc,
+ const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+ unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
+ MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+ unsigned ResultReg = Root.getOperand(0).getReg();
+ unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ bool Src0IsKill = MUL->getOperand(1).isKill();
+ unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ bool Src1IsKill = MUL->getOperand(2).isKill();
+ unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
+ bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
+
+ if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ MRI.constrainRegClass(ResultReg, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ MRI.constrainRegClass(SrcReg0, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ MRI.constrainRegClass(SrcReg1, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
+ MRI.constrainRegClass(SrcReg2, RC);
+
+ MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
+ ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(SrcReg2, getKillRegState(Src2IsKill));
+ // Insert the MADD
+ InsInstrs.push_back(MIB);
+ return MUL;
+}
+
+/// genMaddR - Generate madd instruction and combine mul and add using
+/// an extra virtual register
+/// Example - an ADD intermediate needs to be stored in a register:
+/// MUL I=A,B,0
+/// ADD R,I,Imm
+/// ==> ORR V, ZR, Imm
+/// ==> MADD R,A,B,V
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc,
+ unsigned VR, const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+ MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+ unsigned ResultReg = Root.getOperand(0).getReg();
+ unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ bool Src0IsKill = MUL->getOperand(1).isKill();
+ unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ bool Src1IsKill = MUL->getOperand(2).isKill();
+
+ if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ MRI.constrainRegClass(ResultReg, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ MRI.constrainRegClass(SrcReg0, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ MRI.constrainRegClass(SrcReg1, RC);
+ if (TargetRegisterInfo::isVirtualRegister(VR))
+ MRI.constrainRegClass(VR, RC);
+
+ MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
+ ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(VR);
+ // Insert the MADD
+ InsInstrs.push_back(MIB);
+ return MUL;
+}
+
+/// genAlternativeCodeSequence - when hasPattern() finds a pattern,
+/// this function generates the instructions that could replace the
+/// original code sequence.
+void AArch64InstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineBasicBlock &MBB = *Root.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  MachineInstr *MUL = nullptr;
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ switch (Pattern) {
+ default:
+    // Signal error by leaving MUL null; no combine is generated.
+ break;
+ case MachineCombinerPattern::MC_MULADDW_OP1:
+ case MachineCombinerPattern::MC_MULADDX_OP1:
+ // MUL I=A,B,0
+ // ADD R,I,C
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULADDW_OP2:
+ case MachineCombinerPattern::MC_MULADDX_OP2:
+ // MUL I=A,B,0
+ // ADD R,C,I
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULADDWI_OP1:
+ case MachineCombinerPattern::MC_MULADDXI_OP1: {
+ // MUL I=A,B,0
+ // ADD R,I,Imm
+ // ==> ORR V, ZR, Imm
+ // ==> MADD R,A,B,V
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ uint64_t Imm = Root.getOperand(2).getImm();
+
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
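+    // Keep only the low BitSize bits of the (possibly shifted) immediate so
+    // it can be re-encoded as a logical immediate for the ORR below.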
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ case MachineCombinerPattern::MC_MULSUBW_OP1:
+ case MachineCombinerPattern::MC_MULSUBX_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, C
+ // ==> SUB V, 0, C
+ // ==> MADD R,A,B,V // = -C + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *SubRC;
+ unsigned SubOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+ SubOpc = AArch64::SUBWrr;
+ SubRC = &AArch64::GPR32spRegClass;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ SubOpc = AArch64::SUBXrr;
+ SubRC = &AArch64::GPR64spRegClass;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(SubRC);
+ // SUB NewVR, 0, C
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
+ .addReg(ZeroReg)
+ .addOperand(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ break;
+ }
+ case MachineCombinerPattern::MC_MULSUBW_OP2:
+ case MachineCombinerPattern::MC_MULSUBX_OP2:
+ // MUL I=A,B,0
+ // SUB R,C,I
+ // ==> MSUB R,A,B,C (computes C - A*B)
+ // --- Create(MSUB);
+ if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) {
+ Opc = AArch64::MSUBWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MSUBXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULSUBWI_OP1:
+ case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, Imm
+ // ==> ORR V, ZR, -Imm
+ // ==> MADD R,A,B,V // = -Imm + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ int Imm = Root.getOperand(2).getImm();
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
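+    // As above, but negate first: e.g. for BitSize == 32, Imm == 1 yields
+    // UImm == 0xffffffff, i.e. -1 truncated to 32 bits.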
+ uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ } // end switch (Pattern)
+  // Record MUL and ADD/SUB for deletion. MUL may still be null when no
+  // replacement was generated (e.g. an unencodable immediate), so guard
+  // against pushing a null pointer.
+  if (MUL)
+    DelInstrs.push_back(MUL);
+  DelInstrs.push_back(&Root);
+
+ return;
+}
+
+/// \brief Replace csincr-branch sequence by simple conditional branch
+///
+/// Examples:
+/// 1.
+/// csinc w9, wzr, wzr, <condition code>
+/// tbnz w9, #0, 0x44
+/// to
+/// b.<inverted condition code>
+///
+/// 2.
+/// csinc w9, wzr, wzr, <condition code>
+/// tbz w9, #0, 0x44
+/// to
+/// b.<condition code>
+///
+/// \param MI Conditional Branch
+/// \return True when the simple conditional branch is generated
+///
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
+ bool IsNegativeBranch = false;
+ bool IsTestAndBranch = false;
+ unsigned TargetBBInMI = 0;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown branch instruction?");
+ case AArch64::Bcc:
+ return false;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ TargetBBInMI = 1;
+ break;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ TargetBBInMI = 1;
+ IsNegativeBranch = true;
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ TargetBBInMI = 2;
+ IsTestAndBranch = true;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ TargetBBInMI = 2;
+ IsNegativeBranch = true;
+ IsTestAndBranch = true;
+ break;
+ }
+  // The CSINC we look for produces only 0 or 1, so a test-and-branch on
+  // any bit other than bit 0 makes no sense. Conservatively bail out in
+  // case the verifier missed this case.
+ if (IsTestAndBranch && MI->getOperand(1).getImm())
+ return false;
+
+ // Find Definition.
+  assert(MI->getParent() && "Incomplete machine instruction");
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ unsigned VReg = MI->getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ return false;
+
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+
+ // Look for CSINC
+ if (!(DefMI->getOpcode() == AArch64::CSINCWr &&
+ DefMI->getOperand(1).getReg() == AArch64::WZR &&
+ DefMI->getOperand(2).getReg() == AArch64::WZR) &&
+ !(DefMI->getOpcode() == AArch64::CSINCXr &&
+ DefMI->getOperand(1).getReg() == AArch64::XZR &&
+ DefMI->getOperand(2).getReg() == AArch64::XZR))
+ return false;
+
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ return false;
+
+ AArch64CC::CondCode CC =
+ (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+ bool CheckOnlyCCWrites = true;
+ // Convert only when the condition code is not modified between
+ // the CSINC and the branch. The CC may be used by other
+ // instructions in between.
+ if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo()))
+ return false;
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB();
+ DebugLoc DL = MI->getDebugLoc();
+ if (IsNegativeBranch)
+ CC = AArch64CC::getInvertedCondCode(CC);
+ BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+ MI->eraseFromParent();
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index f70b82b..d8f1274 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -11,11 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64INSTRINFO_H
-#define LLVM_TARGET_AArch64INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
@@ -46,9 +47,15 @@ public:
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ bool isAsCheapAsAMove(const MachineInstr *MI) const override;
+
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DstReg, unsigned &SubIdx) const override;
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;
unsigned isStoreToStackSlot(const MachineInstr *MI,
@@ -87,6 +94,10 @@ public:
unsigned &Offset,
const TargetRegisterInfo *TRI) const override;
+ bool getLdStBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg,
+ int &Offset, int &Width,
+ const TargetRegisterInfo *TRI) const;
+
bool enableClusterLoads() const override { return true; }
bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
@@ -96,8 +107,8 @@ public:
MachineInstr *Second) const override;
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
- uint64_t Offset, const MDNode *MDPtr,
- DebugLoc DL) const;
+ uint64_t Offset, const MDNode *Var,
+ const MDNode *Expr, DebugLoc DL) const;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode,
@@ -117,6 +128,7 @@ public:
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ using TargetInstrInfo::foldMemoryOperandImpl;
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
@@ -153,7 +165,27 @@ public:
bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
-
+ bool optimizeCondBranch(MachineInstr *MI) const override;
+  /// hasPattern - return true when there is potentially a faster code
+  /// sequence for an instruction chain ending in \p Root. All potential
+  /// patterns are listed in the \p Pattern array.
+ bool hasPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern)
+ const override;
+
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern,
+  /// this function generates the instructions that could replace the
+  /// original code sequence.
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+ /// useMachineCombiner - AArch64 supports MachineCombiner
+ bool useMachineCombiner() const override;
+
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
private:
void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
MachineBasicBlock *TBB,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 0ba069e..e0fb90a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -24,6 +24,7 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
+def IsCyclone : Predicate<"Subtarget->isCyclone()">;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -236,6 +237,12 @@ def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
SDT_AArch64WrapperLarge>;
+def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
+
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
//===----------------------------------------------------------------------===//
@@ -474,6 +481,24 @@ def trunc_imm : SDNodeXForm<imm, [{
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
+
+def : Pat<(f32 fpimm:$in),
+ (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
+def : Pat<(f64 fpimm:$in),
+ (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
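+// For example (an illustrative lowering, not shown in the patch): (f32 1.0)
+// can become "mov w8, #0x3f800000" followed by "fmov s0, w8" rather than a
+// load from a constant pool.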
+
+
// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
@@ -632,6 +657,10 @@ def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
} // AddedComplexity = 7
let AddedComplexity = 5 in {
@@ -782,7 +811,7 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
defm EXTR : ExtractImm<"extr">;
def : InstAlias<"ror $dst, $src, $shift",
(EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
@@ -797,7 +826,7 @@ def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
//===----------------------------------------------------------------------===//
// Other bitfield immediate instructions.
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
defm SBFM : BitfieldImm<0b00, "sbfm">;
defm UBFM : BitfieldImm<0b10, "ubfm">;
@@ -970,9 +999,9 @@ def : InstAlias<"cneg $dst, $src, $cc",
// PC-relative instructions.
//===----------------------------------------------------------------------===//
let isReMaterializable = 1 in {
-let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
def ADR : ADRI<0, "adr", adrlabel, []>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
def ADRP : ADRI<1, "adrp", adrplabel,
[(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
@@ -1173,6 +1202,9 @@ defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
+
defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
@@ -1213,6 +1245,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
}
defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
@@ -1226,6 +1259,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
}
} // AddedComplexity = 10
@@ -1355,6 +1389,8 @@ let Predicates = [IsLE] in {
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -1376,6 +1412,8 @@ let Predicates = [IsLE] in {
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -1512,6 +1550,8 @@ let Predicates = [IsLE] in {
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
}
def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
@@ -1532,6 +1572,8 @@ let Predicates = [IsLE] in {
(LDURQi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
(LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
}
// anyext -> zext
@@ -1828,6 +1870,7 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
}
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
@@ -1842,9 +1885,37 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
}
} // AddedComplexity = 10
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
+ ValueType VecTy, ValueType STy,
+ SubRegIndex SubRegIdx,
+ Instruction STRW, Instruction STRX> {
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
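+// For example (illustrative, assuming standard AArch64 assembly): storing
+// lane 0 of a v4f32 held in q0 with a register offset can be selected as
+// "str s0, [x0, x1]" via the ssub subregister rather than an explicit lane
+// extract followed by a scalar store.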
+let AddedComplexity = 19 in {
+ defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+}
+
//---
// (unsigned immediate)
defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
@@ -1892,6 +1963,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i32 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(store (v1f64 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
@@ -1921,6 +1995,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i64 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(store (f128 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
@@ -1983,6 +2060,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i32 FPR64:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
@@ -2013,6 +2093,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2f64 FPR128:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
// unscaled i64 truncating stores
@@ -2089,6 +2172,8 @@ def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2102,6 +2187,8 @@ def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//---
// (immediate post-indexed)
@@ -2139,6 +2226,8 @@ def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2152,6 +2241,8 @@ def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//===----------------------------------------------------------------------===//
// Load/store exclusive instructions.
@@ -2384,6 +2475,28 @@ defm FMOV : FPMoveImmediate<"fmov">;
//===----------------------------------------------------------------------===//
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
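+// (x + (x >>s bits-1)) ^ (x >>s bits-1) is the canonical branchless integer
+// abs idiom; fold it directly to the ABS instruction.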
+def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
+ (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
+ (ABSv8i8 V64:$src)>;
+def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
+ (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
+ (ABSv4i16 V64:$src)>;
+def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
+ (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
+ (ABSv2i32 V64:$src)>;
+def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
+ (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
+ (ABSv16i8 V128:$src)>;
+def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
+ (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
+ (ABSv8i16 V128:$src)>;
+def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
+ (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
+ (ABSv4i32 V128:$src)>;
+def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
+ (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
+ (ABSv2i64 V128:$src)>;
+
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -2412,6 +2525,11 @@ def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
(i64 2))))),
(FCVTLv4i32 V128:$Rn)>;
+def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+ (i64 4))))),
+ (FCVTLv8i16 V128:$Rn)>;
+
defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
@@ -2423,6 +2541,7 @@ def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
@@ -2505,6 +2624,10 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
@@ -3101,6 +3224,46 @@ defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+ SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+ UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
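In scalar C++ terms (a sketch only; the patterns fire on whole vectors), the shape these multiclasses cover is a sign- or zero-extending multiply, optionally fused with an accumulate. The extend-then-multiply maps to SMULL/UMULL, and the TriOpFrag forms above express folding the surrounding add/sub into SMLAL/SMLSL and friends:

    #include <cstddef>
    #include <cstdint>

    // Widening multiply-accumulate: 16-bit inputs, 32-bit accumulator.
    void widenMulAcc(int32_t *acc, const int16_t *a, const int16_t *b,
                     size_t n) {
      for (size_t i = 0; i != n; ++i)
        acc[i] += int32_t(a[i]) * int32_t(b[i]); // sext both, mul, then add
    }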
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -3183,6 +3346,10 @@ def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
(EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
(EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
// We use EXT to handle extract_subvector to copy the upper 64 bits of a
// 128-bit vector.
@@ -3194,6 +3361,8 @@ def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
@@ -3306,6 +3475,19 @@ def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
(v2f64 (DUPv2i64lane
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
(i64 0)))>;
+def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v4f16 (DUPv4i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v8f16 (DUPv8i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+
+def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
@@ -3427,6 +3609,23 @@ def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16lane
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0)),
+ dsub)>;
+
+def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+ (INSvi16lane
+ V128:$Rn, VectorIndexH:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0))>;
+
def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
@@ -3507,6 +3706,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
dsub)>;
}
+defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
@@ -3522,6 +3722,8 @@ def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+ (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
(f64 (EXTRACT_SUBREG
(INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
@@ -3532,6 +3734,11 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
(INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
V128:$Rn, VectorIndexS:$idx),
ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
+ (f16 (EXTRACT_SUBREG
+ (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexH:$idx),
+ hsub))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which might just as well be
@@ -3546,6 +3753,7 @@ def : ConcatPat<v2f64, v1f64>;
def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v8f16, v4f16>;
def : ConcatPat<v16i8, v8i8>;
// If the high lanes are undef, though, we can just ignore them:
@@ -3965,7 +4173,7 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
// AdvSIMD indexed element
//----------------------------------------------------------------------------
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
}
@@ -4386,7 +4594,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize]>;
+ ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4439,8 +4647,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize]>;
-
+ dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -4519,7 +4727,7 @@ defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
@@ -4563,6 +4771,10 @@ def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv2d GPR64sp:$Rn)>;
def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv1d GPR64sp:$Rn)>;
+def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -4576,6 +4788,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
+def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -4590,6 +4803,7 @@ def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
+def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
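The Ld1Lane patterns select a scalar load feeding a vector lane insert into a single LD1 to that lane. A rough C++ model of the v4f16/v8f16 case just added (a sketch; lane layout is simplified and f16 bits are modeled as uint16_t):

    #include <cstdint>
    #include <cstring>

    struct V8f16 { uint16_t lane[8]; };

    // Load one 16-bit element from memory directly into lane idx, leaving
    // the other lanes untouched -- the effect of LD1i16.
    void ld1LaneH(V8f16 &v, const uint16_t *p, unsigned idx) {
      std::memcpy(&v.lane[idx], p, sizeof(uint16_t));
    }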
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -4603,7 +4817,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
@@ -4617,8 +4831,9 @@ def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
+def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
@@ -4631,6 +4846,7 @@ def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
+def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -4655,6 +4871,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -4678,8 +4895,9 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
@@ -4856,10 +5074,77 @@ def : Pat<(trap), (BRK 1)>;
// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
//
+// Natural vector casts (64 bit)
+def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Natural vector casts (128 bit)
+def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
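A natural vector cast retypes the contents of an FPR without emitting any instruction, which is why every pattern above maps straight back to its source register. The closest portable C++ analogue is a size-preserving bit cast (a sketch; std::bit_cast plays the same role in C++20):

    #include <cstring>

    // Reinterpret the bytes of one 64/128-bit value as another type of the
    // same size; like AArch64NvCast, this is free at run time.
    template <typename To, typename From>
    To nvCast(const From &src) {
      static_assert(sizeof(To) == sizeof(From), "widths must match");
      To dst;
      std::memcpy(&dst, &src, sizeof(To));
      return dst;
    }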
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
@@ -4868,6 +5153,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -4880,6 +5167,8 @@ def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
+ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
@@ -4889,6 +5178,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+ (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
}
@@ -4917,6 +5208,7 @@ let Predicates = [IsLE] in {
def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
@@ -4926,6 +5218,8 @@ def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
(v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
(v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
+ (v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
(v1i64 (REV64v2i32 FPR64:$src))>;
}
@@ -4938,6 +5232,7 @@ def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -4950,6 +5245,8 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -4958,6 +5255,7 @@ def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
}
@@ -4970,6 +5268,8 @@ def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
(v4i16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
(v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
@@ -4977,12 +5277,41 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
}
let Predicates = [IsLE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
+ (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
+ (v4f16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
+ (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
@@ -4997,6 +5326,8 @@ def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
(v8i8 (REV32v8i8 FPR64:$src))>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
(v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
+ (v8i8 (REV16v8i8 FPR64:$src))>;
}
let Predicates = [IsLE] in {
@@ -5004,6 +5335,7 @@ def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
@@ -5014,6 +5346,8 @@ def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
(f64 (REV64v2i32 FPR64:$src))>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
(f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
+ (f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
@@ -5023,6 +5357,7 @@ def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -5033,6 +5368,8 @@ def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
(v1f64 (REV64v8i8 FPR64:$src))>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
(v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
+ (v1f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -5043,6 +5380,7 @@ def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -5055,6 +5393,8 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -5064,6 +5404,7 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
}
let Predicates = [IsBE] in {
@@ -5075,6 +5416,9 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src), (i32 8)))>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -5089,6 +5433,7 @@ let Predicates = [IsLE] in {
def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
}
@@ -5100,6 +5445,8 @@ def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
(v2f64 (REV64v4i32 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
(v2f64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -5110,6 +5457,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -5120,6 +5468,8 @@ def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
(REV64v4i32 FPR128:$src), (i32 8)))>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
(v4f32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -5135,6 +5485,7 @@ def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
@@ -5148,6 +5499,8 @@ def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
(v2i64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
(v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
}
def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -5157,6 +5510,7 @@ def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
@@ -5171,6 +5525,8 @@ def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
(v4i32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
@@ -5181,6 +5537,7 @@ def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
@@ -5197,6 +5554,36 @@ def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
+ (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
+ (v8f16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
}
let Predicates = [IsLE] in {
@@ -5206,6 +5593,7 @@ def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
@@ -5222,6 +5610,8 @@ def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
(v16i8 (REV64v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
}
def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
@@ -5245,6 +5635,8 @@ def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3df9c4f..8157981 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -13,20 +13,21 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
@@ -108,7 +109,7 @@ private:
int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
-}
+} // namespace
static bool isUnscaledLdst(unsigned Opc) {
switch (Opc) {
@@ -931,8 +932,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
const TargetMachine &TM = Fn.getTarget();
- TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
- TRI = TM.getRegisterInfo();
+ TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
+ TRI = TM.getSubtargetImpl()->getRegisterInfo();
bool Modified = false;
for (auto &MBB : Fn)
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 75a17b9..e57b0f4 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -25,8 +25,7 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
- AsmPrinter &printer)
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
: Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
MCSymbol *
diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index ba50ba9..1e29b80 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64_MCINSTLOWER_H
-#define AArch64_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Compiler.h"
@@ -33,7 +33,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
Triple TargetTriple;
public:
- AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+ AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer);
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
new file mode 100644
index 0000000..4164b33
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
@@ -0,0 +1,42 @@
+//===- AArch64MachineCombinerPattern.h - AArch64 combiner patterns -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the instruction patterns supported by the machine
+// combiner.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of the instruction patterns supported by the machine combiner.
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+ MC_NONE = 0,
+ MC_MULADDW_OP1 = 1,
+ MC_MULADDW_OP2 = 2,
+ MC_MULSUBW_OP1 = 3,
+ MC_MULSUBW_OP2 = 4,
+ MC_MULADDWI_OP1 = 5,
+ MC_MULSUBWI_OP1 = 6,
+ MC_MULADDX_OP1 = 7,
+ MC_MULADDX_OP2 = 8,
+ MC_MULSUBX_OP1 = 9,
+ MC_MULSUBX_OP2 = 10,
+ MC_MULADDXI_OP1 = 11,
+ MC_MULSUBXI_OP1 = 12
+};
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif
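The enum names encode the source shapes the machine combiner looks for: W/X select 32- or 64-bit arithmetic, a trailing I marks an immediate multiplicand, and OP1/OP2 record which operand of the add/sub is the multiply so it can be fused into a single MADD/MSUB. A scalar C++ sketch of two of the shapes (the operand-numbering reading is inferred from the names):

    #include <cstdint>

    int32_t mulAddW_Op1(int32_t a, int32_t b, int32_t c) {
      return a * b + c; // MC_MULADDW_OP1: the mul is the add's first operand
    }
    int32_t mulSubW_Op2(int32_t a, int32_t b, int32_t c) {
      return c - a * b; // MC_MULSUBW_OP2: the mul is the sub's second operand
    }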
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 7c257ba..536a8d0 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64MACHINEFUNCTIONINFO_H
-#define AArch64MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -160,4 +160,4 @@ private:
};
} // End llvm namespace
-#endif // AArch64MACHINEFUNCTIONINFO_H
+#endif
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644
index 0000000..f942c4e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -0,0 +1,383 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
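The kind of code being constrained is a chain of dependent multiply-accumulates, where each result feeds the next accumulator operand. A C++ sketch of the shape (assuming FMADDDrrr is what instruction selection picks for the fused multiply-add, as in the opcode list handled by apply() below):

    #include <cmath>

    // Each iteration's fma reuses the previous accumulator, forming the
    // "accumulator chain" whose D-register parity the constraint balances.
    double macChain(const double *a, const double *b, int n) {
      double acc = 0.0;
      for (int i = 0; i != n; ++i)
        acc = std::fma(a[i], b[i], acc);
      return acc;
    }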
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG
+bool isFPReg(unsigned reg) {
+ return AArch64::FPR32RegClass.contains(reg) ||
+ AArch64::FPR64RegClass.contains(reg) ||
+ AArch64::FPR128RegClass.contains(reg);
+}
+#endif
+
+bool isOdd(unsigned reg) {
+ switch (reg) {
+ default:
+ llvm_unreachable("Register is not from the expected class!");
+ case AArch64::S1:
+ case AArch64::S3:
+ case AArch64::S5:
+ case AArch64::S7:
+ case AArch64::S9:
+ case AArch64::S11:
+ case AArch64::S13:
+ case AArch64::S15:
+ case AArch64::S17:
+ case AArch64::S19:
+ case AArch64::S21:
+ case AArch64::S23:
+ case AArch64::S25:
+ case AArch64::S27:
+ case AArch64::S29:
+ case AArch64::S31:
+ case AArch64::D1:
+ case AArch64::D3:
+ case AArch64::D5:
+ case AArch64::D7:
+ case AArch64::D9:
+ case AArch64::D11:
+ case AArch64::D13:
+ case AArch64::D15:
+ case AArch64::D17:
+ case AArch64::D19:
+ case AArch64::D21:
+ case AArch64::D23:
+ case AArch64::D25:
+ case AArch64::D27:
+ case AArch64::D29:
+ case AArch64::D31:
+ case AArch64::Q1:
+ case AArch64::Q3:
+ case AArch64::Q5:
+ case AArch64::Q7:
+ case AArch64::Q9:
+ case AArch64::Q11:
+ case AArch64::Q13:
+ case AArch64::Q15:
+ case AArch64::Q17:
+ case AArch64::Q19:
+ case AArch64::Q21:
+ case AArch64::Q23:
+ case AArch64::Q25:
+ case AArch64::Q27:
+ case AArch64::Q29:
+ case AArch64::Q31:
+ return true;
+ case AArch64::S0:
+ case AArch64::S2:
+ case AArch64::S4:
+ case AArch64::S6:
+ case AArch64::S8:
+ case AArch64::S10:
+ case AArch64::S12:
+ case AArch64::S14:
+ case AArch64::S16:
+ case AArch64::S18:
+ case AArch64::S20:
+ case AArch64::S22:
+ case AArch64::S24:
+ case AArch64::S26:
+ case AArch64::S28:
+ case AArch64::S30:
+ case AArch64::D0:
+ case AArch64::D2:
+ case AArch64::D4:
+ case AArch64::D6:
+ case AArch64::D8:
+ case AArch64::D10:
+ case AArch64::D12:
+ case AArch64::D14:
+ case AArch64::D16:
+ case AArch64::D18:
+ case AArch64::D20:
+ case AArch64::D22:
+ case AArch64::D24:
+ case AArch64::D26:
+ case AArch64::D28:
+ case AArch64::D30:
+ case AArch64::Q0:
+ case AArch64::Q2:
+ case AArch64::Q4:
+ case AArch64::Q6:
+ case AArch64::Q8:
+ case AArch64::Q10:
+ case AArch64::Q12:
+ case AArch64::Q14:
+ case AArch64::Q16:
+ case AArch64::Q18:
+ case AArch64::Q20:
+ case AArch64::Q22:
+ case AArch64::Q24:
+ case AArch64::Q26:
+ case AArch64::Q28:
+ case AArch64::Q30:
+ return false;
+ }
+}
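If register numbers were guaranteed to follow the hardware index (D0..D31 mapping to 0..31), parity would be a one-liner; the TableGen-generated AArch64 register enum offers no such guarantee, hence the exhaustive switch above. A hypothetical compact form, for contrast only:

    // Assumes hwIndex is the architectural register number, which the
    // generated enum values are NOT guaranteed to be.
    bool isOddIndex(unsigned hwIndex) { return (hwIndex & 1) != 0; }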
+
+bool haveSameParity(unsigned reg1, unsigned reg2) {
+ assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+ assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+ return isOdd(reg1) == isOdd(reg2);
+}
+
+} // end anonymous namespace
+
+bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
+ unsigned Ra) {
+ if (Rd == Ra)
+ return false;
+
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
+ DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+ << '\n');
+ DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+ << '\n');
+ return false;
+ }
+
+ PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+ PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra);
+
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+ &G.getNodeMetadata(node1).getAllowedRegs();
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed =
+ &G.getNodeMetadata(node2).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+
+ // The edge does not exist. Create one with the appropriate interference
+ // costs.
+ if (edge == G.invalidEdgeId()) {
+ const LiveInterval &ld = LIs.getInterval(Rd);
+ const LiveInterval &la = LIs.getInterval(Ra);
+ bool livesOverlap = ld.overlaps(la);
+
+ PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1,
+ vRaAllowed->size() + 1, 0);
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+ costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+ else
+ costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+ }
+ }
+ G.addEdge(node1, node2, std::move(costs));
+ return true;
+ }
+
+ if (G.getEdgeNode1Id(edge) == node2) {
+ std::swap(node1, node2);
+ std::swap(vRdAllowed, vRaAllowed);
+ }
+
+ // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass))
+ PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge));
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+
+ // Get the maximum cost (excluding unallocatable reg) for same parity
+ // registers
+ PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (haveSameParity(pRd, pRa))
+ if (costs[i + 1][j + 1] !=
+ std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+ costs[i + 1][j + 1] > sameParityMax)
+ sameParityMax = costs[i + 1][j + 1];
+ }
+
+ // Ensure all registers with a different parity have a higher cost
+ // than sameParityMax
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (!haveSameParity(pRd, pRa))
+ if (sameParityMax > costs[i + 1][j + 1])
+ costs[i + 1][j + 1] = sameParityMax + 1.0;
+ }
+ }
+ G.setEdgeCosts(edge, std::move(costs));
+
+ return true;
+}
+
+void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
+ unsigned Ra) {
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ // Do some Chain management
+ if (Chains.count(Ra)) {
+ if (Rd != Ra) {
+ DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+ << PrintReg(Rd, TRI) << '\n';);
+ Chains.remove(Ra);
+ Chains.insert(Rd);
+ }
+ } else {
+ DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+ << '\n';);
+ Chains.insert(Rd);
+ }
+
+ PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+
+ const LiveInterval &ld = LIs.getInterval(Rd);
+ for (auto r : Chains) {
+ // Skip self
+ if (r == Rd)
+ continue;
+
+ const LiveInterval &lr = LIs.getInterval(r);
+ if (ld.overlaps(lr)) {
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+ &G.getNodeMetadata(node1).getAllowedRegs();
+
+ PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r);
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed =
+ &G.getNodeMetadata(node2).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+ assert(edge != G.invalidEdgeId() &&
+ "PBQP error ! The edge should exist !");
+
+ DEBUG(dbgs() << "Refining constraint !\n";);
+
+ if (G.getEdgeNode1Id(edge) == node2) {
+ std::swap(node1, node2);
+ std::swap(vRdAllowed, vRrAllowed);
+ }
+
+ // Enforce that cost is higher with all other Chains of the same parity
+ PBQP::Matrix costs(G.getEdgeCosts(edge));
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+
+ // Get the maximum cost (excluding unallocatable reg) for all other
+ // parity registers
+ PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+ for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRrAllowed)[j];
+ if (!haveSameParity(pRd, pRa))
+ if (costs[i + 1][j + 1] !=
+ std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+ costs[i + 1][j + 1] > sameParityMax)
+ sameParityMax = costs[i + 1][j + 1];
+ }
+
+ // Ensure all registers with same parity have a higher cost
+ // than sameParityMax
+ for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRrAllowed)[j];
+ if (haveSameParity(pRd, pRa))
+ if (sameParityMax > costs[i + 1][j + 1])
+ costs[i + 1][j + 1] = sameParityMax + 1.0;
+ }
+ }
+ G.setEdgeCosts(edge, std::move(costs));
+ }
+ }
+}
+
+static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
+ const MachineInstr &MI) {
+ const LiveInterval &LI = LIs.getInterval(reg);
+ SlotIndex SI = LIs.getInstructionIndex(&MI);
+ return LI.expiredAt(SI);
+}
+
+void A57ChainingConstraint::apply(PBQPRAGraph &G) {
+ const MachineFunction &MF = G.getMetadata().MF;
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+ DEBUG(MF.dump());
+
+ for (const auto &MBB: MF) {
+ Chains.clear(); // FIXME: is this really needed? Could we work at the MF level?
+
+ for (const auto &MI: MBB) {
+
+ // Forget chains that have expired. Collect the dead registers first and
+ // remove them afterwards, so the set is not mutated while being iterated.
+ SmallVector<unsigned, 8> toDel;
+ for (auto r : Chains)
+ if (regJustKilledBefore(LIs, r, MI)) {
+ DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+ MI.print(dbgs()););
+ toDel.push_back(r);
+ }
+
+ while (!toDel.empty()) {
+ Chains.remove(toDel.back());
+ toDel.pop_back();
+ }
+
+ switch (MI.getOpcode()) {
+ case AArch64::FMSUBSrrr:
+ case AArch64::FMADDSrrr:
+ case AArch64::FNMSUBSrrr:
+ case AArch64::FNMADDSrrr:
+ case AArch64::FMSUBDrrr:
+ case AArch64::FMADDDrrr:
+ case AArch64::FNMSUBDrrr:
+ case AArch64::FNMADDDrrr: {
+ unsigned Rd = MI.getOperand(0).getReg();
+ unsigned Ra = MI.getOperand(3).getReg();
+
+ if (addIntraChainConstraint(G, Rd, Ra))
+ addInterChainConstraint(G, Rd, Ra);
+ break;
+ }
+
+ case AArch64::FMLAv2f32:
+ case AArch64::FMLSv2f32: {
+ unsigned Rd = MI.getOperand(0).getReg();
+ addInterChainConstraint(G, Rd, Rd);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+}
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
new file mode 100644
index 0000000..4f656f9
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -0,0 +1,38 @@
+//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+
+namespace llvm {
+
+/// Add the accumulator chaining constraint to a PBQP graph
+class A57ChainingConstraint : public PBQPRAConstraint {
+public:
+ // Add A57 specific constraints to the PBQP graph.
+ void apply(PBQPRAGraph &G) override;
+
+private:
+ SmallSetVector<unsigned, 32> Chains;
+ const TargetRegisterInfo *TRI;
+
+ // Add the accumulator chaining constraint, inside the chain, i.e. so that
+ // parity(Rd) == parity(Ra).
+ // \return true if a constraint was added
+ bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+ // Add constraints between existing chains
+ void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
index b22fa24..9e9eec4 100644
--- a/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -12,6 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+
// 31 entries have cost 0
// 242 entries have cost 1
// 1447 entries have cost 2
@@ -6584,3 +6587,5 @@ static const unsigned PerfectShuffleTable[6561+1] = {
835584U, // <u,u,u,u>: Cost 0 copy LHS
0
};
+
+#endif
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 4723cc4..97b0f0e 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -21,18 +21,18 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -193,7 +193,7 @@ private:
// Inserting into the DenseMap may invalidate existing iterator.
// Keep a copy of the key to find the iterator to erase.
Instruction *OldInstr = IPI->first;
- InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+ InsertPts[NewPt] = std::move(IPI->second);
// Erase IPI.
IPI = InsertPts.find(OldInstr);
InsertPts.erase(IPI);
@@ -569,7 +569,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) {
// global. Do not promote constant expressions either, as they may
// require some code expansion.
if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
- AlreadyChecked.insert(Cst))
+ AlreadyChecked.insert(Cst).second)
LocalChange |= promoteConstant(Cst);
}
}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 01b9587..d734d43 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -76,7 +76,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
@@ -105,7 +105,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
unsigned Reg) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
switch (Reg) {
default:
@@ -169,7 +169,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
}
@@ -236,7 +236,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
MachineFrameInfo *MFI = MF.getFrameInfo();
// Estimate an offset from the frame pointer.
@@ -326,7 +326,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
- MF.getTarget().getFrameLowering());
+ MF.getSubtarget().getFrameLowering());
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
unsigned FrameReg;
@@ -364,7 +364,7 @@ namespace llvm {
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
switch (RC->getID()) {
default:
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 76af1ed..51a5034 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
-#define LLVM_TARGET_AArch64REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
#define GET_REGINFO_HEADER
#include "AArch64GenRegisterInfo.inc"
@@ -98,4 +98,4 @@ public:
} // end namespace llvm
-#endif // LLVM_TARGET_AArch64REGISTERINFO_H
+#endif
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a30e4ad..d5ff3f1 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -390,13 +390,14 @@ def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
}
def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
- v1i64],
+ v1i64, v4f16],
64, (sequence "D%u", 0, 31)>;
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
def FPR128 : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
+ v8f16],
128, (sequence "Q%u", 0, 31)>;
// The lower 16 vector registers. Some instructions can only take registers
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 8209f96..3ec4157 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -12,11 +12,24 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with
+// the much wider out-of-order issue stage, this creates a need to schedule
+// micro-ops carefully so that all three decoded each cycle are successfully
+// issued, as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to three, the narrower of the two stages,
+// while the machine is still modeled as out-of-order.
+
def CortexA57Model : SchedMachineModel {
- let IssueWidth = 8; // 3-way decode and 8-way issue
+ let IssueWidth = 3; // 3-way decode and dispatch
let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
let LoadLatency = 4; // Optimistic load latency
let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
}
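
Since dispatch is the narrow point, IssueWidth = 3 is the per-cycle cap the MachineScheduler uses when filling cycles with micro-ops. A back-of-the-envelope sketch of that bound, assuming nothing beyond the model above (helper name hypothetical):

// Minimum cycles the 3-wide in-order front end needs to push N micro-ops
// into the out-of-order core.
static unsigned dispatchCycles(unsigned NumMicroOps, unsigned IssueWidth = 3) {
  return (NumMicroOps + IssueWidth - 1) / IssueWidth; // ceiling division
}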
//===----------------------------------------------------------------------===//
@@ -24,18 +37,17 @@ def CortexA57Model : SchedMachineModel {
// The Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
// micro-ops wait for their operands and then issue out-of-order.
-def A57UnitB : ProcResource<1> { let BufferSize = 8; } // Type B micro-ops
-def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops
-def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops
-def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops
-def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops
-def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops
-def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+def A57UnitX : ProcResource<1>; // Type X micro-ops
+def A57UnitW : ProcResource<1>; // Type W micro-ops
let SchedModel = CortexA57Model in {
def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
}
-
let SchedModel = CortexA57Model in {
//===----------------------------------------------------------------------===//
@@ -71,7 +83,7 @@ def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
def : SchedAlias<WriteF, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
-def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>;
@@ -85,13 +97,12 @@ def : WriteRes<WriteHint, []> { let Latency = 1; }
def : WriteRes<WriteLDHi, []> { let Latency = 4; }
-// Forwarding logic is not [yet] explicitly modeled beyond what is captured
-// in the latencies of the A57 Generic SchedWriteRes's.
+// Forwarding logic is only modeled for multiply and accumulate
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;
-def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
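
The ReadAdvance change is how the model expresses late-operand forwarding: the accumulator input of a multiply-accumulate (ReadIMA) is sampled 2 cycles after issue, so a producing WriteIM32/WriteIM64 looks 2 cycles cheaper to that particular consumer. A minimal sketch of the arithmetic the scheduler performs, clamped at zero as in MCSchedModel (function name hypothetical):

// Effective producer->consumer latency once a ReadAdvance applies, e.g.
// a 3-cycle multiply feeding the accumulator operand (advance 2) is seen
// as a 1-cycle dependence.
static unsigned effectiveLatency(unsigned DefLatency, unsigned Advance) {
  return DefLatency > Advance ? DefLatency - Advance : 0;
}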
@@ -134,7 +145,13 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
// Cryptography Extensions
// -----------------------------------------------------------------------------
-def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
// Vector Load
@@ -301,4 +318,330 @@ def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_PO
def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>;
def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+// Vector - Integer
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+// ASIMD multiply, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>;
+def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+
+// ASIMD multiply long
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v2f32
+// Q form - v4f32, v2f64
+// D form - 32, 64
+// D form - v1i32, v1i64
+// D form - v2i32
+// Q form - v4i32, v2i64
+
+// ASIMD FP arith, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>;
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>;
+
+// ASIMD FP arith, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>;
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>;
+
+// ASIMD FP compare, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>;
+// ASIMD FP compare, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FDIVv2f64")>;
+
+// Note: These were simply duplicated from ASIMD FDIV because of missing
+// documentation.
+// ASIMD FP square root, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FSQRTv2f32")>;
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FSQRTv4f32")>;
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FSQRTv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>;
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>;
+// ASIMD FP max/min, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>;
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>;
+// ASIMD FP max/min, reduce
+def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
+
+// ASIMD FP multiply, D-form, FZ
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD FP multiply, Q-form, FZ
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; }
+def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+
+// Vector - Miscellaneous
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+
+// ASIMD duplicate, gen reg, D-form and Q-form
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>;
+
+// ASIMD reciprocal estimate, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>;
+
+// ASIMD reciprocal step, D-form, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD reciprocal step, Q-form, FZ
+def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Remainder
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+
+def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPM : SchedReadAdvance<0>;
+def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FDIVDrr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FDIVSrr)>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FSQRTDr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FSQRTSr)>;
+
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>;
+
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>;
+
} // SchedModel = CortexA57Model
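
With A57WriteIVMA (latency 5) and A57ReadIVMA4 (advance 4) as defined above, a chain of vector multiply-accumulates linked through the accumulator operand pays the full latency only once; every later link is seen at 5 - 4 = 1 cycle. A hedged sketch of the resulting timing (helper name hypothetical):

// Cycles until the final result of an accumulator chain of N MLAs is
// ready: full latency for the first, the forwarded distance thereafter.
static unsigned mlaChainCycles(unsigned N) {
  const unsigned Latency = 5, Advance = 4;
  return N == 0 ? 0 : Latency + (N - 1) * (Latency - Advance);
}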
diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
index a8f421b..6f30108 100644
--- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -28,14 +28,18 @@ def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
-def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; }
-def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+ let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
-def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; }
-def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
@@ -53,6 +57,7 @@ def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 64;
let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
}
def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
@@ -137,6 +142,7 @@ def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 36;
let NumMicroOps = 2;
+ let ResourceCycles = [18, 18];
}
def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
A57UnitM]> {
@@ -153,6 +159,10 @@ def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
let Latency = 3;
let NumMicroOps = 2;
}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
let Latency = 4;
@@ -295,6 +305,11 @@ def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL,
let Latency = 9;
let NumMicroOps = 4;
}
+def A57Write_12cyc_4V : SchedWriteRes<[A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
//===----------------------------------------------------------------------===//
@@ -334,6 +349,11 @@ def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL,
let Latency = 9;
let NumMicroOps = 5;
}
+def A57Write_9cyc_5V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
//===----------------------------------------------------------------------===//
@@ -399,7 +419,7 @@ def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI,
let Latency = 4;
let NumMicroOps = 7;
}
-def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
+def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
A57UnitS, A57UnitS, A57UnitS,
A57UnitS, A57UnitS, A57UnitS]> {
let Latency = 6;
@@ -412,6 +432,12 @@ def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI,
let Latency = 9;
let NumMicroOps = 7;
}
+def A57Write_12cyc_7V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 7;
+}
//===----------------------------------------------------------------------===//
@@ -443,11 +469,11 @@ def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS,
//===----------------------------------------------------------------------===//
// Define Generic 9 micro-op types
-def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS]> {
+def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
let Latency = 8;
let NumMicroOps = 9;
}
@@ -459,6 +485,12 @@ def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
let Latency = 11;
let NumMicroOps = 9;
}
+def A57Write_15cyc_9V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 15;
+ let NumMicroOps = 9;
+}
//===----------------------------------------------------------------------===//
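
Adding ResourceCycles to the long-latency writes separates two ideas that Latency alone conflates: Latency says when the result is ready, while ResourceCycles = [18] keeps the X unit occupied so a second divide cannot start underneath it, matching an unpipelined divider. A small sketch of the consequence (struct and helper are illustrative, not LLVM types):

struct WriteRes { unsigned Latency, ResourceCycles; };

// Total cycles for N back-to-back divides on one fully serialized unit:
// each occupies the pipe for ResourceCycles; the last result arrives
// Latency cycles after its own issue.
static unsigned backToBackCycles(WriteRes W, unsigned N) {
  return N == 0 ? 0 : (N - 1) * W.ResourceCycles + W.Latency;
}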
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 1bf64fc..0cfd582 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -36,8 +36,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
// instead of memset.
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
const AArch64TargetLowering &TLI =
- *static_cast<const AArch64TargetLowering *>(
- DAG.getTarget().getTargetLowering());
+ *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering();
EVT IntPtr = TLI.getPointerTy();
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 1180eea..11932d2 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64SELECTIONDAGINFO_H
-#define AArch64SELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
#include "llvm/Target/TargetSelectionDAGInfo.h"
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 45f8ddb..0c36e8f 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -39,7 +39,7 @@ public:
static char ID;
AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
- virtual const char *getPassName() const override {
+ const char *getPassName() const override {
return "AArch64 Store Pair Suppression";
}
@@ -50,7 +50,7 @@ private:
bool isNarrowFPStore(const MachineInstr &MI);
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineTraceMetrics>();
AU.addPreserved<MachineTraceMetrics>();
@@ -85,8 +85,7 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
// If a subtarget does not define resources for STPQi, bail here.
if (SCDesc->isValid() && !SCDesc->isVariant()) {
- unsigned ResLenWithSTP = BBTrace.getResourceLength(
- ArrayRef<const MachineBasicBlock *>(), SCDesc);
+ unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
if (ResLenWithSTP > ResLength) {
DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
<< " resources " << ResLength << " -> " << ResLenWithSTP
@@ -118,12 +117,13 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
- TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
- TRI = MF->getTarget().getRegisterInfo();
+ TII =
+ static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
+ TRI = MF->getSubtarget().getRegisterInfo();
MRI = &MF->getRegInfo();
const TargetSubtargetInfo &ST =
MF->getTarget().getSubtarget<TargetSubtargetInfo>();
- SchedModel.init(*ST.getSchedModel(), &ST, TII);
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index bb0b72c..47b5d54 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64PBQPRegAlloc.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -43,8 +44,8 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
AArch64Subtarget::AArch64Subtarget(const std::string &TT,
const std::string &CPU,
- const std::string &FS, TargetMachine &TM,
- bool LittleEndian)
+ const std::string &FS,
+ const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
@@ -64,13 +65,7 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT,
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
-
- // Determine whether this is a reference to a definition or a declaration.
- // Materializable GVs (in JIT lazy compilation mode) do not require an extra
- // load from stub.
- bool isDecl = GV->hasAvailableExternallyLinkage();
- if (GV->isDeclaration() && !GV->isMaterializable())
- isDecl = true;
+ bool isDecl = GV->isDeclarationForLinker();
// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
@@ -78,10 +73,15 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
return AArch64II::MO_GOT;
// The small code model's direct accesses use ADRP, which cannot necessarily
- // produce the value 0 (if the code is above 4GB). Therefore they must use the
- // GOT.
- if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
- return AArch64II::MO_GOT;
+ // produce the value 0 (if the code is above 4GB).
+ if (TM.getCodeModel() == CodeModel::Small &&
+ GV->isWeakForLinker() && isDecl) {
+ // In PIC mode use the GOT, but in absolute mode use a constant pool load.
+ if (TM.getRelocationModel() == Reloc::Static)
+ return AArch64II::MO_CONSTPOOL;
+ else
+ return AArch64II::MO_GOT;
+ }
// If symbol visibility is hidden, the extra load is not needed if
// the symbol is definitely defined in the current translation unit.
@@ -128,3 +128,11 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
bool AArch64Subtarget::enableEarlyIfConversion() const {
return EnableEarlyIfConvert;
}
+
+std::unique_ptr<PBQPRAConstraint>
+AArch64Subtarget::getCustomPBQPConstraints() const {
+ if (!isCortexA57())
+ return nullptr;
+
+ return llvm::make_unique<A57ChainingConstraint>();
+}
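
The ClassifyGlobalReference change encodes one rule: under the small code model a weak declaration may resolve to address 0, which ADRP cannot materialize, so PIC code keeps indirecting through the GOT while static-relocation code now loads the address from a constant pool (the new MO_CONSTPOOL flag). Restated as a minimal sketch; the enum and its values are illustrative, not the real target flags:

enum RefFlags { Direct = 0, GOT = 1, ConstPool = 2 };

// How a weak, possibly-null global is classified under the small code
// model, per the patch above: ADRP cannot produce 0, so indirect via the
// GOT (PIC) or a literal-pool load (-static).
static RefFlags classifyWeakDecl(bool IsStaticReloc) {
  return IsStaticReloc ? ConstPool : GOT;
}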
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 52124f6..e2740f1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64SUBTARGET_H
-#define AArch64SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
-#include "AArch64InstrInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -69,18 +69,27 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
AArch64Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, TargetMachine &TM, bool LittleEndian);
+ const std::string &FS, const TargetMachine &TM,
+ bool LittleEndian);
- const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
- const AArch64FrameLowering *getFrameLowering() const {
+ const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const AArch64FrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- const AArch64TargetLowering *getTargetLowering() const {
+ const AArch64TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; }
- const DataLayout *getDataLayout() const { return &DL; }
+ const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const AArch64RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
bool enableMachineScheduler() const override { return true; }
+ bool enablePostMachineScheduler() const override {
+ return isCortexA53() || isCortexA57();
+ }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -94,12 +103,19 @@ public:
bool isLittleEndian() const { return DL.isLittleEndian(); }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isCyclone() const { return CPUString == "cyclone"; }
+ bool isCortexA57() const { return CPUString == "cortex-a57"; }
+ bool isCortexA53() const { return CPUString == "cortex-a53"; }
+
+ bool useAA() const override { return isCortexA53(); }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -126,7 +142,9 @@ public:
unsigned NumRegionInstrs) const override;
bool enableEarlyIfConversion() const override;
+
+ std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
};
} // End llvm namespace
-#endif // AArch64SUBTARGET_H
+#endif
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index f99b90b..d4f19d2 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -12,8 +12,11 @@
#include "AArch64.h"
#include "AArch64TargetMachine.h"
-#include "llvm/PassManager.h"
+#include "AArch64TargetObjectFile.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -24,6 +27,10 @@ static cl::opt<bool>
EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableMCR("aarch64-mcr",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool>
EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
cl::init(true), cl::Hidden);
@@ -59,13 +66,41 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
" to make use of cmpxchg flow-based information"),
cl::init(true));
+static cl::opt<bool>
+EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableCondOpt("aarch64-condopt",
+ cl::desc("Enable the condition optimizer pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
+ cl::desc("Work around Cortex-A53 erratum 835769"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableGEPOpt("aarch64-gep-opt", cl::Hidden,
+ cl::desc("Enable optimizations on complex GEPs"),
+ cl::init(true));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+ RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target);
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO())
+ return make_unique<AArch64_MachoTargetObjectFile>();
- RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
- RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
+ return make_unique<AArch64_ELFTargetObjectFile>();
}
/// TargetMachine ctor - Create an AArch64 architecture model.
@@ -77,10 +112,39 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool LittleEndian)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, *this, LittleEndian) {
+ TLOF(createTLOF(Triple(getTargetTriple()))),
+ Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) {
initAsmInfo();
}
+AArch64TargetMachine::~AArch64TargetMachine() {}
+
+const AArch64Subtarget *
+AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
+ AttributeSet FnAttrs = F.getAttributes();
+ Attribute CPUAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ Attribute FSAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle);
+ }
+ return I.get();
+}
+
void AArch64leTargetMachine::anchor() { }
AArch64leTargetMachine::
@@ -104,7 +168,10 @@ namespace {
class AArch64PassConfig : public TargetPassConfig {
public:
AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ }
AArch64TargetMachine &getAArch64TargetMachine() const {
return getTM<AArch64TargetMachine>();
@@ -114,10 +181,10 @@ public:
bool addPreISel() override;
bool addInstSelector() override;
bool addILPOpts() override;
- bool addPreRegAlloc() override;
- bool addPostRegAlloc() override;
- bool addPreSched2() override;
- bool addPreEmitPass() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
};
} // namespace
@@ -136,7 +203,7 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
- addPass(createAtomicExpandLoadLinkedPass(TM));
+ addPass(createAtomicExpandPass(TM));
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -145,6 +212,19 @@ void AArch64PassConfig::addIRPasses() {
addPass(createCFGSimplificationPass());
TargetPassConfig::addIRPasses();
+
+ if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+ // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+ // and lower a GEP with multiple indices to either arithmetic operations or
+ // multiple GEPs with single index.
+ addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ // Call EarlyCSE pass to find and remove subexpressions in the lowered
+ // result.
+ addPass(createEarlyCSEPass());
+ // Do loop invariant code motion in case part of the lowered result is
+ // invariant.
+ addPass(createLICMPass());
+ }
}
// Pass Pipeline Configuration
@@ -174,43 +254,56 @@ bool AArch64PassConfig::addInstSelector() {
}
bool AArch64PassConfig::addILPOpts() {
+ if (EnableCondOpt)
+ addPass(createAArch64ConditionOptimizerPass());
if (EnableCCMP)
addPass(createAArch64ConditionalCompares());
- addPass(&EarlyIfConverterID);
+ if (EnableMCR)
+ addPass(&MachineCombinerID);
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
return true;
}
-bool AArch64PassConfig::addPreRegAlloc() {
+void AArch64PassConfig::addPreRegAlloc() {
// Use AdvSIMD scalar instructions whenever profitable.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
addPass(createAArch64AdvSIMDScalar());
- return true;
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+ addPass(&PeepholeOptimizerID);
+ }
}
-bool AArch64PassConfig::addPostRegAlloc() {
+void AArch64PassConfig::addPostRegAlloc() {
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
addPass(createAArch64DeadRegisterDefinitions());
- return true;
+ if (TM->getOptLevel() != CodeGenOpt::None &&
+ (TM->getSubtarget<AArch64Subtarget>().isCortexA53() ||
+ TM->getSubtarget<AArch64Subtarget>().isCortexA57()) &&
+ usingDefaultRegAlloc())
+    // Improve performance of some FP/SIMD code on Cortex-A57.
+ addPass(createAArch64A57FPLoadBalancing());
}
-bool AArch64PassConfig::addPreSched2() {
+void AArch64PassConfig::addPreSched2() {
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.
if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
addPass(createAArch64LoadStoreOptimizationPass());
- return true;
}
-bool AArch64PassConfig::addPreEmitPass() {
+void AArch64PassConfig::addPreEmitPass() {
+ if (EnableA53Fix835769)
+ addPass(createAArch64A53Fix835769());
// Relax conditional branch instructions if they're otherwise out of
// range of their destination.
addPass(createAArch64BranchRelaxation());
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
addPass(createAArch64CollectLOHPass());
- return true;
}
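
The new getSubtargetImpl(const Function &) caches one subtarget per distinct "target-cpu"/"target-features" attribute pair, keyed by the concatenated strings, so modules mixing attributes do not rebuild a subtarget for every function. A standalone sketch of the scheme; Subtarget here is a stand-in struct, not the real class:

#include "llvm/ADT/StringMap.h"
#include <memory>
#include <string>

struct Subtarget { std::string CPU, FS; };

static Subtarget *getOrCreate(llvm::StringMap<std::unique_ptr<Subtarget>> &Map,
                              const std::string &CPU, const std::string &FS) {
  auto &Slot = Map[CPU + FS];           // one entry per (CPU, FS) pair
  if (!Slot)
    Slot.reset(new Subtarget{CPU, FS}); // built lazily, reused afterwards
  return Slot.get();
}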
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 852cb3f..75c65c5 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64TARGETMACHINE_H
-#define AArch64TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
@@ -23,7 +23,9 @@ namespace llvm {
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
AArch64Subtarget Subtarget;
+ mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
public:
AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
@@ -31,33 +33,25 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool IsLittleEndian);
+ ~AArch64TargetMachine() override;
+
const AArch64Subtarget *getSubtargetImpl() const override {
return &Subtarget;
}
- const AArch64TargetLowering *getTargetLowering() const override {
- return getSubtargetImpl()->getTargetLowering();
- }
- const DataLayout *getDataLayout() const override {
- return getSubtargetImpl()->getDataLayout();
- }
- const AArch64FrameLowering *getFrameLowering() const override {
- return getSubtargetImpl()->getFrameLowering();
- }
- const AArch64InstrInfo *getInstrInfo() const override {
- return getSubtargetImpl()->getInstrInfo();
- }
- const AArch64RegisterInfo *getRegisterInfo() const override {
- return &getInstrInfo()->getRegisterInfo();
- }
- const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
- return getSubtargetImpl()->getSelectionDAGInfo();
- }
+ const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
/// \brief Register AArch64 analysis passes with a pass manager.
void addAnalysisPasses(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile* getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+private:
+ bool isLittle;
};
// AArch64leTargetMachine - AArch64 little endian target machine.
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index de63cb4..2e595f9 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1dac14b..b1a2914 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -51,7 +51,7 @@ public:
AArch64TTI(const AArch64TargetMachine *TM)
: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
- TLI(TM->getTargetLowering()) {
+ TLI(TM->getSubtargetImpl()->getTargetLowering()) {
initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
}
@@ -104,7 +104,7 @@ public:
return 64;
}
- unsigned getMaximumUnrollFactor() const override { return 2; }
+ unsigned getMaxInterleaveFactor() const override;
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
override;
@@ -112,10 +112,11 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
override;
- unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind Opd1Info = OK_AnyValue,
- OperandValueKind Opd2Info = OK_AnyValue) const
- override;
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
+ OperandValueKind Opd2Info = OK_AnyValue,
+ OperandValueProperties Opd1PropInfo = OP_None,
+ OperandValueProperties Opd2PropInfo = OP_None) const override;
unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
@@ -124,6 +125,13 @@ public:
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const override;
+
+ unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
+
+ void getUnrollingPreferences(const Function *F, Loop *L,
+ UnrollingPreferences &UP) const override;
+
+
/// @}
};
@@ -400,18 +408,42 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
return 2;
}
-unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind Opd1Info,
- OperandValueKind Opd2Info) const {
+unsigned AArch64TTI::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+ OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
+ OperandValueProperties Opd2PropInfo) const {
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (ISD == ISD::SDIV &&
+ Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On AArch64, a scalar signed division by a power-of-two constant is
+    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+    // The OperandValue properties may not be the same as those of the
+    // previous operation; conservatively assume OP_None.
+ unsigned Cost =
+ getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return Cost;
+ }
+
switch (ISD) {
default:
- return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
- Opd2Info);
+ return TargetTransformInfo::getArithmeticInstrCost(
+ Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -498,3 +530,27 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
+
+unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
+ unsigned Cost = 0;
+ for (auto *I : Tys) {
+ if (!I->isVectorTy())
+ continue;
+ if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
+ Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
+ getMemoryOpCost(Instruction::Load, I, 128, 0);
+ }
+ return Cost;
+}
+
+unsigned AArch64TTI::getMaxInterleaveFactor() const {
+ if (ST->isCortexA57())
+ return 4;
+ return 2;
+}
+
+void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
+ UnrollingPreferences &UP) const {
+ // Disable partial & runtime unrolling on -Os.
+ UP.PartialOptSizeThreshold = 0;
+}
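
getCostOfKeepingLiveOverCall charges a store plus a load for every 128-bit vector type live across a call: AAPCS64 preserves only the low 64 bits of the callee-saved V registers, so a full Q register has to be spilled before the call and reloaded after it. A simplified sketch with a plain struct standing in for llvm::Type (all names hypothetical):

#include <vector>

struct VecTy { unsigned ScalarBits, NumElts; };

// Sum spill+reload cost for vectors that occupy a whole Q register.
static unsigned liveOverCallCost(const std::vector<VecTy> &Tys,
                                 unsigned StoreCost, unsigned LoadCost) {
  unsigned Cost = 0;
  for (const VecTy &T : Tys)
    if (T.ScalarBits * T.NumElts == 128)
      Cost += StoreCost + LoadCost;
  return Cost;
}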
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 37e9296..8eb906b 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -10,27 +10,28 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
#include <cstdio>
using namespace llvm;
@@ -42,7 +43,6 @@ class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
MCSubtargetInfo &STI;
- MCAsmParser &Parser;
// Map of register aliases registers via the .req directive.
StringMap<std::pair<bool, unsigned> > RegisterReqs;
@@ -52,10 +52,7 @@ private:
return static_cast<AArch64TargetStreamer &>(TS);
}
- MCAsmParser &getParser() const { return Parser; }
- MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
- SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+ SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
@@ -69,11 +66,13 @@ private:
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
- bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+ void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
bool showMatchError(SMLoc Loc, unsigned ErrCode);
bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveInst(SMLoc L);
+
bool parseDirectiveTLSDescCall(SMLoc L);
bool parseDirectiveLOH(StringRef LOH, SMLoc L);
@@ -85,7 +84,7 @@ private:
bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
- unsigned &ErrorInfo,
+ uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
/// @name Auto-generated Match Functions
/// {
@@ -117,10 +116,11 @@ public:
AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ : MCTargetAsmParser(), STI(_STI) {
MCAsmParserExtension::Initialize(_Parser);
- if (Parser.getStreamer().getTargetStreamer() == nullptr)
- new AArch64TargetStreamer(Parser.getStreamer());
+ MCStreamer &S = getParser().getStreamer();
+ if (S.getTargetStreamer() == nullptr)
+ new AArch64TargetStreamer(S);
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
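
The recurring edit in this file drops the cached MCAsmParser member in favor of fetching getParser() at the top of each method that needs it, leaving the base class as the single owner of the handle. A generic sketch of the pattern with illustrative types (not LLVM's):

struct Lexer { int Pos = 0; };

class ParserBase {
public:
  Lexer &getLexer() { return Lex; }
private:
  Lexer Lex;
};

class TargetParser : public ParserBase {
public:
  // Before the patch: a duplicate reference member cached at
  // construction. After: fetch on entry, so no stale alias exists.
  void parseSomething() {
    Lexer &L = getLexer();
    ++L.Pos; // operate through the freshly fetched handle
  }
};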
@@ -1875,6 +1875,7 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
/// Identifier when called, and if it is a register name the token is eaten and
/// the register is added to the operand list.
int AArch64AsmParser::tryParseRegister() {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -1899,6 +1900,7 @@ int AArch64AsmParser::tryParseRegister() {
/// tryMatchVectorRegister - Try to parse a vector register name with optional
/// kind specifier. If it is a register specifier, eat the token and return it.
int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
TokError("vector register expected");
return -1;
@@ -1931,6 +1933,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
@@ -1960,6 +1963,7 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
/// tryParsePrefetch - Try to parse a prefetch operand.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
// Either an identifier for named values or a 5-bit immediate.
@@ -2007,6 +2011,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
/// instruction.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const MCExpr *Expr;
@@ -2057,6 +2062,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
/// instruction.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const MCExpr *Expr;
@@ -2076,6 +2082,7 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
/// tryParseFPImm - A floating point immediate expression operand.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
bool Hash = false;
@@ -2138,6 +2145,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
if (Parser.getTok().is(AsmToken::Hash))
@@ -2229,6 +2237,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
/// parseCondCode - Parse a Condition Code operand.
bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
bool invertCondCode) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -2254,6 +2263,7 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
/// them if present.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
std::string LowerID = Tok.getString().lower();
AArch64_AM::ShiftExtendType ShOp =
@@ -2299,10 +2309,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
if (Hash)
Parser.Lex(); // Eat the '#'.
- // Make sure we do actually have a number
- if (!Parser.getTok().is(AsmToken::Integer)) {
- Error(Parser.getTok().getLoc(),
- "expected integer shift amount");
+ // Make sure we actually have a number or a parenthesized expression.
+ SMLoc E = Parser.getTok().getLoc();
+ if (!Parser.getTok().is(AsmToken::Integer) &&
+ !Parser.getTok().is(AsmToken::LParen)) {
+ Error(E, "expected integer shift amount");
return MatchOperand_ParseFail;
}
@@ -2312,11 +2323,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
if (!MCE) {
- TokError("expected #imm after shift specifier");
+ Error(E, "expected constant '#imm' after shift specifier");
return MatchOperand_ParseFail;
}
- SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
Operands.push_back(AArch64Operand::CreateShiftExtend(
ShOp, MCE->getValue(), true, S, E, getContext()));
return MatchOperand_Success;
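
The hunk above relaxes the shift-amount check to accept a parenthesized expression as well as a bare integer literal. A minimal standalone sketch of the two accepted first-token forms (hypothetical helper, not the LLVM parser API):

#include <cassert>
#include <cctype>
#include <string>

// Hypothetical helper (illustration only): a shift amount may now begin
// with either an integer literal or an opening parenthesis.
static bool looksLikeShiftAmount(const std::string &Tok) {
  return !Tok.empty() &&
         (std::isdigit((unsigned char)Tok[0]) || Tok[0] == '(');
}

int main() {
  assert(looksLikeShiftAmount("3"));       // add x0, x1, x2, lsl #3
  assert(looksLikeShiftAmount("(1 + 2)")); // add x0, x1, x2, lsl #(1 + 2)
  assert(!looksLikeShiftAmount("xzr"));    // still "expected integer shift amount"
  return 0;
}
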
@@ -2333,6 +2344,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
Operands.push_back(
AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
@@ -2571,6 +2583,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Can be either a #imm style literal or an option name
@@ -2624,6 +2637,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
@@ -2638,6 +2652,7 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
/// tryParseVectorRegister - Parse a vector register operand.
bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier))
return true;
@@ -2686,6 +2701,7 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
// Try for a vector register.
if (!tryParseVectorRegister(Operands))
@@ -2728,6 +2744,7 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+ MCAsmParser &Parser = getParser();
bool HasELFModifier = false;
AArch64MCExpr::VariantKind RefKind;
@@ -2806,6 +2823,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Curly Brace");
SMLoc S = getLoc();
Parser.Lex(); // Eat the left curly brace.
@@ -2904,6 +2922,7 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
@@ -2949,6 +2968,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
/// operand regardless of the mnemonic.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode) {
+ MCAsmParser &Parser = getParser();
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -3114,6 +3134,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
Name = StringSwitch<StringRef>(Name.lower())
.Case("beq", "b.eq")
.Case("bne", "b.ne")
@@ -3562,12 +3583,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
}
}
-static const char *getSubtargetFeatureName(unsigned Val);
+static const char *getSubtargetFeatureName(uint64_t Val);
bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
- unsigned &ErrorInfo,
+ uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
@@ -3817,7 +3838,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Special case the error message for the very common case where only
// a single subtarget feature is missing (e.g., NEON).
std::string Msg = "instruction requires:";
- unsigned Mask = 1;
+ uint64_t Mask = 1;
for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
if (ErrorInfo & Mask) {
Msg += " ";
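
The widening to uint64_t matters for this loop: with a 32-bit probe mask, feature bits 32 and above could never be reported. Standalone sketch of the walk:

#include <cstdint>
#include <cstdio>

// Illustration only: walk a 64-bit missing-feature mask the way the loop
// above does. A 32-bit Mask would overflow to zero and miss bits 32..62.
static void listSetBits(uint64_t ErrorInfo) {
  uint64_t Mask = 1;
  for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
    if (ErrorInfo & Mask)
      printf(" bit %u", I);
    Mask <<= 1;
  }
  printf("\n");
}

int main() {
  listSetBits((1ULL << 40) | 1); // prints " bit 0 bit 40"
  return 0;
}
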
@@ -3831,7 +3852,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return showMatchError(IDLoc, MatchResult);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0U) {
+ if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -3906,11 +3927,15 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
llvm_unreachable("Implement any new match types added!");
- return true;
}
/// ParseDirective parses the AArch64-specific directives
bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+ const MCObjectFileInfo::Environment Format =
+ getContext().getObjectFileInfo()->getObjectFileType();
+ bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+ bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+
StringRef IDVal = DirectiveID.getIdentifier();
SMLoc Loc = DirectiveID.getLoc();
if (IDVal == ".hword")
@@ -3926,12 +3951,18 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".unreq")
return parseDirectiveUnreq(DirectiveID.getLoc());
+ if (!IsMachO && !IsCOFF) {
+ if (IDVal == ".inst")
+ return parseDirectiveInst(Loc);
+ }
+
return parseDirectiveLOH(IDVal, Loc);
}
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+ MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
@@ -3954,6 +3985,47 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
+/// parseDirectiveInst
+/// ::= .inst opcode [, ...]
+bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
+ MCAsmParser &Parser = getParser();
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.eatToEndOfStatement();
+ Error(Loc, "expected expression following directive");
+ return false;
+ }
+
+ for (;;) {
+ const MCExpr *Expr;
+
+ if (getParser().parseExpression(Expr)) {
+ Error(Loc, "expected expression");
+ return false;
+ }
+
+ const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+ if (!Value) {
+ Error(Loc, "expected constant expression");
+ return false;
+ }
+
+ getTargetStreamer().emitInst(Value->getValue());
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Loc, "unexpected token in directive");
+ return false;
+ }
+
+ Parser.Lex(); // Eat comma.
+ }
+
+ Parser.Lex();
+ return false;
+}
+
// parseDirectiveTLSDescCall:
// ::= .tlsdesccall symbol
bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
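
The new directive accepts one or more comma-separated constant expressions, e.g. ".inst 0xd503201f, 0xd65f03c0" (nop, ret). A rough standalone model of the operand loop, assuming plain numeric literals rather than full MC expressions:

#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Rough model of the .inst operand loop: each item must fold to a
// constant, and an empty list is an error ("expected expression").
static bool parseInstList(const std::string &Line,
                          std::vector<uint32_t> &Out) {
  std::stringstream SS(Line);
  std::string Item;
  while (std::getline(SS, Item, ',')) {
    long long V;
    if (sscanf(Item.c_str(), " %lli", &V) != 1)
      return false; // "expected constant expression"
    Out.push_back((uint32_t)V);
  }
  return !Out.empty();
}

int main() {
  std::vector<uint32_t> Enc;
  if (parseInstList("0xd503201f, 0xd65f03c0", Enc)) // nop, ret
    for (uint32_t I : Enc)
      printf(".inst 0x%08x\n", I);
  return 0;
}
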
@@ -3985,10 +4057,9 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
// We successfully parsed a numeric value for the identifier.
// Check that it is valid.
int64_t Id = getParser().getTok().getIntVal();
- Kind = (MCLOHType)Id;
- // Check that Id does not overflow MCLOHType.
- if (!isValidMCLOHType(Kind) || Id != Kind)
+ if (Id <= -1U && !isValidMCLOHType(Id))
return TokError("invalid numeric identifier in directive");
+ Kind = (MCLOHType)Id;
} else {
StringRef Name = getTok().getIdentifier();
// We successfully parsed an identifier.
@@ -4036,6 +4107,7 @@ bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
/// parseDirectiveReq
/// ::= name .req registername
bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+ MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
unsigned RegNum = tryParseRegister();
@@ -4067,7 +4139,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
Parser.Lex(); // Consume the EndOfStatement
auto pair = std::make_pair(IsVector, RegNum);
- if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+ if (!RegisterReqs.insert(std::make_pair(Name, pair)).second)
Warning(L, "ignoring redefinition of register alias '" + Name + "'");
return true;
@@ -4076,6 +4148,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
/// parseDirectiveUnreq
/// ::= .unreq registername
bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
Parser.eatToEndOfStatement();
@@ -4140,9 +4213,7 @@ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
extern "C" void LLVMInitializeAArch64AsmParser() {
RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
-
- RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
- RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+ RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64Target);
}
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 789d549..f26327f 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -2,7 +2,7 @@ set(LLVM_TARGET_DEFINITIONS AArch64.td)
tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
@@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(AArch64CommonTableGen)
add_llvm_target(AArch64CodeGen
+ AArch64A57FPLoadBalancing.cpp
AArch64AddressTypePromotion.cpp
AArch64AdvSIMDScalarPass.cpp
AArch64AsmPrinter.cpp
@@ -25,13 +26,16 @@ add_llvm_target(AArch64CodeGen
AArch64DeadRegisterDefinitionsPass.cpp
AArch64ExpandPseudoInsts.cpp
AArch64FastISel.cpp
+ AArch64A53Fix835769.cpp
AArch64FrameLowering.cpp
+ AArch64ConditionOptimizer.cpp
AArch64ISelDAGToDAG.cpp
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
AArch64MCInstLower.cpp
AArch64PromoteConstant.cpp
+ AArch64PBQPRegAlloc.cpp
AArch64RegisterInfo.cpp
AArch64SelectionDAGInfo.cpp
AArch64StorePairSuppress.cpp
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 6de27d6..878e29c 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -15,12 +15,11 @@
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -200,26 +199,24 @@ static MCDisassembler *createAArch64Disassembler(const Target &T,
}
DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
- const MemoryObject &Region,
- uint64_t Address,
- raw_ostream &os,
- raw_ostream &cs) const {
- CommentStream = &cs;
-
- uint8_t bytes[4];
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
Size = 0;
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+ if (Bytes.size() < 4)
return Fail;
Size = 4;
// Encoded as a little-endian 32-bit word in the stream.
- uint32_t insn =
- (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
+ uint32_t Insn =
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
// Calling the auto-generated decoder function.
- return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+ return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
}
static MCSymbolizer *
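
The MemoryObject-to-ArrayRef migration leaves the byte arithmetic unchanged; standalone illustration of the little-endian word reassembly:

#include <cstdint>
#include <cstdio>

// Standalone illustration (not the LLVM API): rebuild a 32-bit A64
// instruction word from four little-endian stream bytes.
static uint32_t decodeWord(const uint8_t Bytes[4]) {
  return (uint32_t(Bytes[3]) << 24) | (uint32_t(Bytes[2]) << 16) |
         (uint32_t(Bytes[1]) << 8) | uint32_t(Bytes[0]);
}

int main() {
  const uint8_t Nop[4] = {0x1f, 0x20, 0x03, 0xd5};
  printf("%08x\n", decodeWord(Nop)); // d503201f (nop)
  return 0;
}
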
@@ -243,13 +240,9 @@ extern "C" void LLVMInitializeAArch64Disassembler() {
TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
- createAArch64Disassembler);
- TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+ TargetRegistry::RegisterMCDisassembler(TheARM64Target,
createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
- createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+ TargetRegistry::RegisterMCSymbolizer(TheARM64Target,
createAArch64ExternalSymbolizer);
}
@@ -592,7 +585,7 @@ static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
uint64_t Addr,
const void *Decoder) {
// scale{5} is asserted as 1 in tblgen.
- Imm |= 0x20;
+ Imm |= 0x20;
Inst.addOperand(MCOperand::CreateImm(64 - Imm));
return Success;
}
@@ -614,7 +607,7 @@ static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
if (ImmVal & (1 << (19 - 1)))
ImmVal |= ~((1LL << 19) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+ if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr,
Inst.getOpcode() != AArch64::LDRXl, 0, 4))
Inst.addOperand(MCOperand::CreateImm(ImmVal));
return Success;
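
Replacing ImmVal << 2 with ImmVal * 4 sidesteps the undefined behavior of left-shifting a negative signed value; the rest is a plain 19-bit sign extension. Standalone sketch:

#include <cassert>
#include <cstdint>

// Standalone sketch of the 19-bit PC-relative decode above; the
// multiply keeps negative offsets well-defined in C++.
static int64_t decodeLabel19(uint32_t Imm) {
  int64_t ImmVal = Imm & ((1u << 19) - 1);
  if (ImmVal & (1 << (19 - 1)))
    ImmVal |= ~((1LL << 19) - 1);
  return ImmVal * 4; // word offset -> byte offset
}

int main() {
  assert(decodeLabel19(1) == 4);
  assert(decodeLabel19(0x7ffff) == -4); // all-ones encodes -1 word
  return 0;
}
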
@@ -630,35 +623,19 @@ static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
uint64_t Address,
const void *Decoder) {
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
- const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
- Imm |= 0x8000;
Inst.addOperand(MCOperand::CreateImm(Imm));
- bool ValidNamed;
- (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
- .toString(Imm, ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ // Every system register in the encoding space is valid with the syntax
+ // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds.
+ return Success;
}
static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
uint64_t Address,
const void *Decoder) {
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
- const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
- Imm |= 0x8000;
Inst.addOperand(MCOperand::CreateImm(Imm));
- bool ValidNamed;
- (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
- .toString(Imm, ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ return Success;
}
static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -1510,7 +1487,7 @@ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
if (imm & (1 << (26 - 1)))
imm |= ~((1LL << 26) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+ if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
Inst.addOperand(MCOperand::CreateImm(imm));
return Success;
@@ -1530,7 +1507,7 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
bool ValidNamed;
(void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
-
+
return ValidNamed ? Success : Fail;
}
@@ -1552,7 +1529,7 @@ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
else
DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
Inst.addOperand(MCOperand::CreateImm(bit));
- if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+ if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
Inst.addOperand(MCOperand::CreateImm(dst));
return Success;
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 68d4867..7fb57ad 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64DISASSEMBLER_H
-#define AArch64DISASSEMBLER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
#include "llvm/MC/MCDisassembler.h"
@@ -28,11 +28,10 @@ public:
~AArch64Disassembler() {}
- /// getInstruction - See MCDisassembler.
MCDisassembler::DecodeStatus
- getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
- uint64_t address, raw_ostream &vStream,
- raw_ostream &cStream) const override;
+ getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &VStream,
+ raw_ostream &CStream) const override;
};
} // namespace llvm
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 171d31c..12b8450 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64EXTERNALSYMBOLIZER_H
-#define AArch64EXTERNALSYMBOLIZER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
#include "llvm/MC/MCExternalSymbolizer.h"
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index a4224f4..62827e8 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AArch64Disassembler
parent = AArch64
-required_libraries = AArch64Info AArch64Utils MC Support
+required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 8a21f06..46a1d79 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,8 +16,8 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -1223,7 +1223,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() << 2);
+ O << "#" << (Op.getImm() * 4);
return;
}
@@ -1247,7 +1247,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() << 12);
+ O << "#" << (Op.getImm() * (1 << 12));
return;
}
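
The same shift-to-multiply change appears here: the ADRP immediate counts 4 KiB pages, and Imm * (1 << 12) stays well-defined for negative page offsets. Minimal sketch:

#include <cstdint>
#include <cstdio>

// Minimal sketch of the ADRP immediate scaling printed above.
static int64_t adrpByteOffset(int64_t PageImm) {
  return PageImm * (1 << 12); // pages -> bytes, defined for PageImm < 0
}

int main() {
  printf("%lld\n", (long long)adrpByteOffset(1));  // 4096
  printf("%lld\n", (long long)adrpByteOffset(-2)); // -8192
  return 0;
}
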
@@ -1276,24 +1276,20 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
- std::string Name = Mapper.toString(Val, Valid);
+ std::string Name = Mapper.toString(Val);
- if (Valid)
- O << StringRef(Name).upper();
+ O << StringRef(Name).upper();
}
void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
- std::string Name = Mapper.toString(Val, Valid);
+ std::string Name = Mapper.toString(Val);
- if (Valid)
- O << StringRef(Name).upper();
+ O << StringRef(Name).upper();
}
void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
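
With the validity check dropped, every encoding still prints: registers without a name fall back to the generic S<op0>_<op1>_<Cn>_<Cm>_<op2> spelling. A sketch of that fallback, assuming the usual 2:3:4:4:3 packing of op0:op1:CRn:CRm:op2 in the 16-bit MRS/MSR immediate:

#include <cstdint>
#include <cstdio>

// Sketch only; the field layout is an assumption stated above.
static void printGenericSysReg(uint16_t Bits) {
  unsigned Op0 = (Bits >> 14) & 0x3;
  unsigned Op1 = (Bits >> 11) & 0x7;
  unsigned CRn = (Bits >> 7) & 0xf;
  unsigned CRm = (Bits >> 3) & 0xf;
  unsigned Op2 = Bits & 0x7;
  printf("S%u_%u_C%u_C%u_%u\n", Op0, Op1, CRn, CRm, Op2);
}

int main() {
  printGenericSysReg(0xc202); // S3_0_C4_C0_2 under this packing
  return 0;
}
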
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index fe7666e..5f51621 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64INSTPRINTER_H
-#define AArch64INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
@@ -127,8 +127,9 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &O) override;
bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ raw_ostream &O) override;
StringRef getRegName(unsigned RegNo) const override {
return getRegisterName(RegNo);
}
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 642c183..573fa10 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 8b1e44e2..4db9dea 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
-#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -51,7 +51,7 @@ enum ShiftExtendType {
/// getShiftExtendName - Get the string encoding for the shift type.
static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
switch (ST) {
- default: assert(false && "unhandled shift type!");
+ default: llvm_unreachable("unhandled shift type!");
case AArch64_AM::LSL: return "lsl";
case AArch64_AM::LSR: return "lsr";
case AArch64_AM::ASR: return "asr";
@@ -210,67 +210,63 @@ static inline uint64_t ror(uint64_t elt, unsigned size) {
/// as the immediate operand of a logical instruction for the given register
/// size. If so, return true with "encoding" set to the encoded value in
/// the form N:immr:imms.
-static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
- uint64_t &encoding) {
- if (imm == 0ULL || imm == ~0ULL ||
- (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
+ uint64_t &Encoding) {
+ if (Imm == 0ULL || Imm == ~0ULL ||
+ (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
return false;
- unsigned size = 2;
- uint64_t eltVal = imm;
-
// First, determine the element size.
- while (size < regSize) {
- unsigned numElts = regSize / size;
- unsigned mask = (1ULL << size) - 1;
- uint64_t lowestEltVal = imm & mask;
-
- bool allMatched = true;
- for (unsigned i = 1; i < numElts; ++i) {
- uint64_t currEltVal = (imm >> (i*size)) & mask;
- if (currEltVal != lowestEltVal) {
- allMatched = false;
- break;
- }
- }
+ unsigned Size = RegSize;
+
+ do {
+ Size /= 2;
+ uint64_t Mask = (1ULL << Size) - 1;
- if (allMatched) {
- eltVal = lowestEltVal;
+ if ((Imm & Mask) != ((Imm >> Size) & Mask)) {
+ Size *= 2;
break;
}
-
- size *= 2;
- }
+ } while (Size > 2);
// Second, determine the rotation to make the element be: 0^m 1^n.
- for (unsigned i = 0; i < size; ++i) {
- eltVal = ror(eltVal, size);
- uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
- uint32_t cto = CountTrailingOnes_64(eltVal);
-
- if (clz + cto == size) {
- // Encode in immr the number of RORs it would take to get *from* this
- // element value to our target value, where i+1 is the number of RORs
- // to go the opposite direction.
- unsigned immr = size - (i + 1);
-
- // If size has a 1 in the n'th bit, create a value that has zeroes in
- // bits [0, n] and ones above that.
- uint64_t nimms = ~(size-1) << 1;
-
- // Or the CTO value into the low bits, which must be below the Nth bit
- // bit mentioned above.
- nimms |= (cto-1);
-
- // Extract the seventh bit and toggle it to create the N field.
- unsigned N = ((nimms >> 6) & 1) ^ 1;
-
- encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
- return true;
- }
+ uint32_t CTO, I;
+ uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size);
+ Imm &= Mask;
+
+ if (isShiftedMask_64(Imm)) {
+ I = countTrailingZeros(Imm);
+ assert(I < 64 && "undefined behavior");
+ CTO = CountTrailingOnes_64(Imm >> I);
+ } else {
+ Imm |= ~Mask;
+ if (!isShiftedMask_64(~Imm))
+ return false;
+
+ unsigned CLO = CountLeadingOnes_64(Imm);
+ I = 64 - CLO;
+ CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size);
}
- return false;
+ // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
+ // to our target value, where I is the number of RORs to go the opposite
+ // direction.
+ assert(Size > I && "I should be smaller than element size");
+ unsigned Immr = (Size - I) & (Size - 1);
+
+ // If size has a 1 in the n'th bit, create a value that has zeroes in
+ // bits [0, n] and ones above that.
+ uint64_t NImms = ~(Size-1) << 1;
+
+ // Or the CTO value into the low bits, which must be below the Nth bit
+ // mentioned above.
+ NImms |= (CTO-1);
+
+ // Extract the seventh bit and toggle it to create the N field.
+ unsigned N = ((NImms >> 6) & 1) ^ 1;
+
+ Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f);
+ return true;
}
/// isLogicalImmediate - Return true if the immediate is valid for a logical
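
The rewritten encoder relies on the defining property of A64 logical immediates: a 2-, 4-, 8-, 16-, 32- or 64-bit element replicated across the register, rotated into the form 0^m 1^n. A standalone check of the replication half of that property (illustration only, not the encoder itself):

#include <cstdint>
#include <cstdio>

// Illustration: a valid logical immediate is a Size-bit pattern
// replicated across all 64 bits.
static bool replicates(uint64_t Imm, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  uint64_t Elt = Imm & Mask;
  for (unsigned I = Size; I < 64; I += Size)
    if (((Imm >> I) & Mask) != Elt)
      return false;
  return true;
}

int main() {
  printf("%d\n", replicates(0x5555555555555555ULL, 2));  // 1: "01" replicated
  printf("%d\n", replicates(0x1234123412341234ULL, 16)); // 1
  printf("%d\n", replicates(0x123456789abcdef0ULL, 32)); // 0
  return 0;
}
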
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a917616..423da65 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -13,10 +13,11 @@
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -131,7 +132,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
- assert(false && "Unknown fixup kind!");
+ llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
if (SignedValue > 2097151 || SignedValue < -2097152)
report_fatal_error("fixup value out of range");
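
For reference, the bound enforced for fixup_aarch64_pcrel_adr_imm21 is the ADR reach of +/-1 MiB, i.e. a signed 21-bit byte offset. Trivial standalone check:

#include <cassert>
#include <cstdint>

// Sketch of the adr_imm21 range check: offsets in [-2097152, 2097151].
static bool fitsAdrImm21(int64_t V) {
  return V >= -2097152 && V <= 2097151;
}

int main() {
  assert(fitsAdrImm21(2097151));
  assert(!fitsAdrImm21(2097152)); // would report_fatal_error
  return 0;
}
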
@@ -238,7 +239,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
MCInst &Res) const {
- assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+ llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
@@ -316,42 +317,6 @@ public:
MachO::CPU_SUBTYPE_ARM64_ALL);
}
- bool doesSectionRequireSymbols(const MCSection &Section) const override {
- // Any section for which the linker breaks things into atoms needs to
- // preserve symbols, including assembler local symbols, to identify
- // those atoms. These sections are:
- // Sections of type:
- //
- // S_CSTRING_LITERALS (e.g. __cstring)
- // S_LITERAL_POINTERS (e.g. objc selector pointers)
- // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
- //
- // Sections named:
- //
- // __TEXT,__eh_frame
- // __TEXT,__ustring
- // __DATA,__cfstring
- // __DATA,__objc_classrefs
- // __DATA,__objc_catlist
- //
- // FIXME: It would be better if the compiler used actual linker local
- // symbols for each of these sections rather than preserving what
- // are ostensibly assembler local symbols.
- const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
- return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
- SMO.getType() == MachO::S_4BYTE_LITERALS ||
- SMO.getType() == MachO::S_8BYTE_LITERALS ||
- SMO.getType() == MachO::S_16BYTE_LITERALS ||
- SMO.getType() == MachO::S_LITERAL_POINTERS ||
- (SMO.getSegmentName() == "__TEXT" &&
- (SMO.getSectionName() == "__eh_frame" ||
- SMO.getSectionName() == "__ustring")) ||
- (SMO.getSegmentName() == "__DATA" &&
- (SMO.getSectionName() == "__cfstring" ||
- SMO.getSectionName() == "__objc_classrefs" ||
- SMO.getSectionName() == "__objc_catlist")));
- }
-
/// \brief Generate the compact unwind encoding from the CFI directives.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
@@ -534,8 +499,8 @@ void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
// store fixups in .eh_frame section in big endian order
if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
- const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
- if (SecELF->getSectionName() == ".eh_frame")
+ const MCSectionELF *SecELF = dyn_cast_or_null<const MCSectionELF>(Sec);
+ if (SecELF && SecELF->getSectionName() == ".eh_frame")
Value = ByteSwap_32(unsigned(Value));
}
AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
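
The dyn_cast_or_null guard avoids a crash when a FK_Data_4 fixup has no associated section; the .eh_frame swap itself is a plain 32-bit byte reversal, sketched standalone here:

#include <cstdint>
#include <cstdio>

// Standalone equivalent of ByteSwap_32 as used above for big-endian
// .eh_frame fixups.
static uint32_t byteSwap32(uint32_t V) {
  return (V << 24) | ((V & 0xff00) << 8) | ((V >> 8) & 0xff00) | (V >> 24);
}

int main() {
  printf("%08x\n", byteSwap32(0x12345678)); // 78563412
  return 0;
}
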
@@ -551,7 +516,8 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
return new DarwinAArch64AsmBackend(T, MRI);
assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
- return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true);
}
MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
@@ -561,6 +527,7 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
assert(TheTriple.isOSBinFormatELF() &&
"Big endian is only supported for ELF targets!");
- return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFAArch64AsmBackend(T, OSABI,
/*IsLittleEndian=*/false);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index e05191e..5ea49c3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -78,7 +78,7 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
- return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+ return ELF::R_AARCH64_TLSDESC_ADR_PAGE21;
llvm_unreachable("invalid symbol kind for ADRP relocation");
case AArch64::fixup_aarch64_pcrel_branch26:
return ELF::R_AARCH64_JUMP26;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index a79406d..8dc6c30 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -15,8 +15,10 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -34,12 +36,42 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
+class AArch64ELFStreamer;
+
+class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
+ formatted_raw_ostream &OS;
+
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+};
+
+AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : AArch64TargetStreamer(S), OS(OS) {}
+
+void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
+ OS << "\t.inst\t0x" << utohexstr(Inst) << "\n";
+}
+
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+ AArch64ELFStreamer &getStreamer();
+
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
/// the appropriate points in the object files. These symbols are defined in the
/// AArch64 ELF ABI:
@@ -55,6 +87,8 @@ namespace {
/// by MachO. Beware!
class AArch64ELFStreamer : public MCELFStreamer {
public:
+ friend class AArch64TargetELFStreamer;
+
AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
@@ -82,6 +116,18 @@ public:
MCELFStreamer::EmitInstruction(Inst, STI);
}
+ void emitInst(uint32_t Inst) {
+ char Buffer[4];
+ const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+ EmitA64MappingSymbol();
+ for (unsigned II = 0; II != 4; ++II) {
+ const unsigned I = LittleEndian ? (4 - II - 1) : II;
+ Buffer[4 - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+ }
+ MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+ }
+
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
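
A standalone model of the emitInst() byte loop above, under the assumption CHAR_BIT == 8: the buffer is filled back to front, so little-endian output puts the low byte of the instruction first in the stream:

#include <climits>
#include <cstdint>
#include <cstdio>

// Same indexing as emitInst() above, lifted out for illustration.
static void encode(uint32_t Inst, bool LittleEndian, uint8_t Buffer[4]) {
  for (unsigned II = 0; II != 4; ++II) {
    const unsigned I = LittleEndian ? (4 - II - 1) : II;
    Buffer[4 - II - 1] = uint8_t(Inst >> (I * CHAR_BIT));
  }
}

int main() {
  uint8_t LE[4], BE[4];
  encode(0xd503201f, true, LE);  // 1f 20 03 d5
  encode(0xd503201f, false, BE); // d5 03 20 1f
  printf("%02x %02x %02x %02x\n", LE[0], LE[1], LE[2], LE[3]);
  return 0;
}
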
@@ -131,7 +177,9 @@ private:
MCELF::SetType(SD, ELF::STT_NOTYPE);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
- Symbol->setSection(*getCurrentSection().first);
+ auto Sec = getCurrentSection().first;
+ assert(Sec && "need a section");
+ Symbol->setSection(*Sec);
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
Symbol->setVariableValue(Value);
@@ -144,17 +192,35 @@ private:
/// @}
};
+} // end anonymous namespace
+
+AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
+ return static_cast<AArch64ELFStreamer &>(Streamer);
+}
+
+void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
+ getStreamer().emitInst(Inst);
}
namespace llvm {
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+ MCStreamer *S = llvm::createAsmStreamer(
+ Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+ new AArch64TargetAsmStreamer(*S, OS);
+ return S;
+}
+
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack) {
+ bool RelaxAll) {
AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+ new AArch64TargetELFStreamer(*S);
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
- if (NoExecStack)
- S->getAssembler().setNoExecStack(true);
return S;
}
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index bc6973b..71b05cc 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64_ELF_STREAMER_H
-#define LLVM_AARCH64_ELF_STREAMER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
#include "llvm/MC/MCELFStreamer.h"
@@ -20,7 +20,7 @@ namespace llvm {
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack);
+ bool RelaxAll);
}
-#endif // AArch64_ELF_STREAMER_H
+#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index bf405fb..0f5b765 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AArch64FIXUPKINDS_H
-#define LLVM_AArch64FIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 1763b40..f048474 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -13,8 +13,8 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -37,6 +37,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
PrivateGlobalPrefix = "L";
+ PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
PointerSize = CalleeSaveStackSlotSize = 8;
@@ -66,7 +67,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
Triple T(TT);
- if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
+ if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
// We prefer NEON instructions to be printed in the short form.
@@ -79,6 +80,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
CommentString = "//";
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
Code32Directive = ".code\t32";
Data16bitsDirective = "\t.hword\t";
@@ -89,7 +91,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
WeakRefDirective = "\t.weak\t";
- HasLEB128 = true;
SupportsDebugInformation = true;
// Exceptions handling
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 42a031d..5d03c21 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64TARGETASMINFO_H
-#define AArch64TARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
#include "llvm/MC/MCAsmInfoDarwin.h"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index f051357..4756a192 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -15,13 +15,13 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -437,8 +437,7 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
return 3;
}
- assert(false && "Invalid value for vector shift amount!");
- return 0;
+ llvm_unreachable("Invalid value for vector shift amount!");
}
uint32_t
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 42a6787..e396df8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -90,8 +90,9 @@ const MCSection *AArch64MCExpr::FindAssociatedSection() const {
}
bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const {
- if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup))
return false;
Res =
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 5422f9d..db48ac9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AArch64MCEXPR_H
-#define LLVM_AArch64MCEXPR_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
@@ -152,7 +152,8 @@ public:
const MCSection *FindAssociatedSection() const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const override;
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index ae698c5..0f7a6b8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -126,15 +126,14 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- const MCSubtargetInfo &STI, bool RelaxAll,
- bool NoExecStack) {
+ const MCSubtargetInfo &STI, bool RelaxAll) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
/*LabelSections*/ true);
- return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+ return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
}
// Force static initialization.
@@ -142,17 +141,14 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
- RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
- RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn Z(TheARM64Target, createAArch64MCAsmInfo);
// Register the MC codegen info.
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
createAArch64MCCodeGenInfo);
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
createAArch64MCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
- createAArch64MCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
createAArch64MCCodeGenInfo);
// Register the MC instruction info.
@@ -160,9 +156,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCInstrInfo);
TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
createAArch64MCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
- createAArch64MCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCInstrInfo(TheARM64Target,
createAArch64MCInstrInfo);
// Register the MC register info.
@@ -170,9 +164,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCRegisterInfo);
TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
createAArch64MCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
- createAArch64MCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCRegInfo(TheARM64Target,
createAArch64MCRegisterInfo);
// Register the MC subtarget info.
@@ -180,9 +172,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCSubtargetInfo);
TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
createAArch64MCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
- createAArch64MCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
createAArch64MCSubtargetInfo);
// Register the asm backend.
@@ -190,19 +180,15 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64leAsmBackend);
TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
createAArch64beAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+ TargetRegistry::RegisterMCAsmBackend(TheARM64Target,
createAArch64leAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
- createAArch64beAsmBackend);
// Register the MC Code Emitter
TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
createAArch64MCCodeEmitter);
TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
createAArch64MCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
- createAArch64MCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+ TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
createAArch64MCCodeEmitter);
// Register the object streamer.
@@ -210,16 +196,21 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createMCStreamer);
TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
createMCStreamer);
- TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
- TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(TheAArch64leTarget,
+ createAArch64MCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheAArch64beTarget,
+ createAArch64MCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheARM64Target,
+ createAArch64MCAsmStreamer);
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
createAArch64MCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
createAArch64MCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
- createAArch64MCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+ TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
createAArch64MCInstPrinter);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index d886ea2..1553115 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -11,19 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64MCTARGETDESC_H
-#define AArch64MCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
#include <string>
namespace llvm {
+class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCInstPrinter;
class MCRegisterInfo;
class MCObjectWriter;
+class MCStreamer;
class MCSubtargetInfo;
class StringRef;
class Target;
@@ -31,8 +34,7 @@ class raw_ostream;
extern Target TheAArch64leTarget;
extern Target TheAArch64beTarget;
-extern Target TheARM64leTarget;
-extern Target TheARM64beTarget;
+extern Target TheARM64Target;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -51,6 +53,11 @@ MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
uint32_t CPUSubtype);
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst);
} // End llvm namespace
// Defines symbolic names for AArch64 registers. This defines a mapping from
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index ba95366..f6fab5d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -9,15 +9,16 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -33,7 +34,7 @@ public:
: MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/true) {}
- void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
const MCAsmLayout &Layout, const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
@@ -112,8 +113,25 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
}
}
+static bool canUseLocalRelocation(const MCSectionMachO &Section,
+ const MCSymbol &Symbol, unsigned Log2Size) {
+ // Debug info sections can use local relocations.
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ return true;
+
+ // Otherwise, only pointer-sized relocations are supported.
+ if (Log2Size != 3)
+ return false;
+
+ // But only if they don't point to a cstring.
+ if (!Symbol.isInSection())
+ return true;
+ const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ return RefSec.getType() != MachO::S_CSTRING_LITERALS;
+}
+
void AArch64MachObjectWriter::RecordRelocation(
- MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
@@ -123,9 +141,9 @@ void AArch64MachObjectWriter::RecordRelocation(
unsigned Log2Size = 0;
int64_t Value = 0;
unsigned Index = 0;
- unsigned IsExtern = 0;
unsigned Type = 0;
unsigned Kind = Fixup.getKind();
+ const MCSymbolData *RelSymbol = nullptr;
FixupOffset += Fixup.getOffset();
@@ -171,10 +189,8 @@ void AArch64MachObjectWriter::RecordRelocation(
// FIXME: Should this always be extern?
// SymbolNum of 0 indicates the absolute section.
Type = MachO::ARM64_RELOC_UNSIGNED;
- Index = 0;
if (IsPCRel) {
- IsExtern = 1;
Asm.getContext().FatalError(Fixup.getLoc(),
"PC relative absolute relocation!");
@@ -198,15 +214,12 @@ void AArch64MachObjectWriter::RecordRelocation(
Layout.getSymbolOffset(&B_SD) ==
Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
// SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
- Index = A_Base->getIndex();
- IsExtern = 1;
Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
IsPCRel = 1;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
return;
} else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
@@ -252,26 +265,31 @@ void AArch64MachObjectWriter::RecordRelocation(
? 0
: Writer->getSymbolAddress(B_Base, Layout));
- Index = A_Base->getIndex();
- IsExtern = 1;
Type = MachO::ARM64_RELOC_UNSIGNED;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
- Index = B_Base->getIndex();
- IsExtern = 1;
+ RelSymbol = B_Base;
Type = MachO::ARM64_RELOC_SUBTRACTOR;
} else { // A + constant
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
- const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
- const MCSymbolData *Base = Asm.getAtom(&SD);
const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
Fragment->getParent()->getSection());
+ bool CanUseLocalRelocation =
+ canUseLocalRelocation(Section, *Symbol, Log2Size);
+ if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Asm.addLocalUsedInReloc(*Symbol);
+ }
+
+ const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
+
// If the symbol is a variable and we weren't able to get a Base for it
// (i.e., it's not in the symbol table associated with a section) resolve
// the relocation based on its expansion instead.
@@ -288,7 +306,8 @@ void AArch64MachObjectWriter::RecordRelocation(
// FIXME: Will the Target we already have ever have any data in it
// we need to preserve and merge with the new Target? How about
// the FixedValue?
- if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout,
+ &Fixup))
Asm.getContext().FatalError(Fixup.getLoc(),
"unable to resolve variable '" +
Symbol->getName() + "'");
@@ -309,16 +328,13 @@ void AArch64MachObjectWriter::RecordRelocation(
// sections, and for pointer-sized relocations (.quad), we allow section
// relocations. It's code sections that run into trouble.
if (Base) {
- Index = Base->getIndex();
- IsExtern = 1;
+ RelSymbol = Base;
// Add the local offset, if needed.
if (Base != &SD)
Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
} else if (Symbol->isInSection()) {
- // Pointer-sized relocations can use a local relocation. Otherwise,
- // we have to be in a debug info section.
- if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+ if (!CanUseLocalRelocation)
Asm.getContext().FatalError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + Symbol->getName() +
@@ -328,7 +344,6 @@ void AArch64MachObjectWriter::RecordRelocation(
const MCSectionData &SymSD =
Asm.getSectionData(SD.getSymbol().getSection());
Index = SymSD.getOrdinal() + 1;
- IsExtern = 0;
Value += Writer->getSymbolAddress(&SD, Layout);
if (IsPCRel)
@@ -361,16 +376,16 @@ void AArch64MachObjectWriter::RecordRelocation(
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
// Now set up the Addend relocation.
Type = MachO::ARM64_RELOC_ADDEND;
Index = Value;
+ RelSymbol = nullptr;
IsPCRel = 0;
Log2Size = 2;
- IsExtern = 0;
// Put zero into the instruction itself. The addend is in the relocation.
Value = 0;
@@ -382,9 +397,9 @@ void AArch64MachObjectWriter::RecordRelocation(
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
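Note: with this change the writer no longer packs an explicit (IsExtern << 27) term; passing a RelSymbol into addRelocation() leaves r_extern and r_symbolnum for the object writer to fill in, and canUseLocalRelocation() centralizes the old "debug section or pointer-sized" test. A rough standalone illustration of the r_word1 bit layout the hunks pack by hand (plain C++, no LLVM headers; the helper name is invented):

#include <cassert>
#include <cstdint>

// Field widths follow MachO's relocation_info: 24-bit r_symbolnum,
// 1-bit r_pcrel, 2-bit r_length (log2 of the access size),
// 1-bit r_extern, 4-bit r_type.
static uint32_t packRelocWord1(uint32_t Index, bool IsPCRel,
                               uint32_t Log2Size, uint32_t Type) {
  assert(Index < (1u << 24) && Log2Size < 4 && Type < 16);
  return (Index << 0) | ((IsPCRel ? 1u : 0u) << 24) | (Log2Size << 25) |
         (Type << 28);
}

int main() {
  // A pointer-sized (Log2Size == 3), non-PC-relative ARM64_RELOC_UNSIGNED
  // (type 0) against section ordinal 2:
  assert(packRelocWord1(2, false, 3, 0) == ((2u << 0) | (3u << 25)));
  return 0;
}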
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index dcc1a3c..e3112fa 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -39,3 +39,5 @@ void AArch64TargetStreamer::emitCurrentConstantPool() {
// finish() - write out any non-empty assembler constant pools.
void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+void AArch64TargetStreamer::emitInst(uint32_t Inst) {}
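Note: emitInst() lands here as a no-op default on the common target streamer, the usual hook pattern: the shared base does nothing and binary-emitting subclasses override it. A standalone sketch of that pattern (illustrative names, not LLVM's classes):

#include <cstdint>
#include <cstdio>

struct TargetStreamerBase {
  virtual ~TargetStreamerBase() = default;
  virtual void emitInst(uint32_t Inst) {} // default: ignore
};

// An object-emitting subclass writes the 32-bit encoding out
// little-endian, as a little-endian AArch64 streamer would.
struct ObjectStreamerSketch : TargetStreamerBase {
  void emitInst(uint32_t Inst) override {
    for (int i = 0; i < 4; ++i)
      std::putchar(static_cast<int>((Inst >> (8 * i)) & 0xff));
  }
};

int main() {
  ObjectStreamerSketch S;
  S.emitInst(0xd503201f); // AArch64 NOP encoding
  return 0;
}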
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 3a382c1..f42ecb1 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -14,18 +14,19 @@ using namespace llvm;
namespace llvm {
Target TheAArch64leTarget;
Target TheAArch64beTarget;
-Target TheARM64leTarget;
-Target TheARM64beTarget;
+Target TheARM64Target;
} // end namespace llvm
extern "C" void LLVMInitializeAArch64TargetInfo() {
- RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
- "AArch64 (little endian)");
- RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
- "AArch64 (big endian)");
+ // Now register the "arm64" name for use with "-march". We don't want it to
+ // take possession of the Triple::aarch64 tag though.
+ TargetRegistry::RegisterTarget(TheARM64Target, "arm64",
+ "ARM64 (little endian)",
+ [](Triple::ArchType) { return false; }, true);
+
RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
}
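Note: the always-false ArchType predicate keeps "arm64" selectable by name (e.g. -march=arm64) while guaranteeing it never wins triple-based lookup, so Triple::aarch64 still resolves to the entries registered below it. A standalone sketch of that two-path lookup (invented types, not LLVM's TargetRegistry):

#include <functional>
#include <string>
#include <vector>

enum class Arch { aarch64, aarch64_be };

struct TargetEntry {
  std::string Name;                      // matched for -march=<name>
  std::function<bool(Arch)> MatchesArch; // matched for triple lookup
};

// An entry whose predicate always returns false (like "arm64" above)
// can never win here; it stays reachable only by name.
const TargetEntry *lookupByArch(const std::vector<TargetEntry> &Ts, Arch A) {
  for (const auto &T : Ts)
    if (T.MatchesArch(A))
      return &T;
  return nullptr;
}

int main() {
  std::vector<TargetEntry> Ts = {
      {"arm64", [](Arch) { return false; }},
      {"aarch64", [](Arch A) { return A == Arch::aarch64; }},
  };
  // Triple lookup resolves to "aarch64", never "arm64".
  return lookupByArch(Ts, Arch::aarch64)->Name == "aarch64" ? 0 : 1;
}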
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 3c24bb3..bc6c7a9 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -791,22 +791,22 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
}
}
- // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits
- // are: 11 xxx 1x11 xxxx xxx
- Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+ // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
+ Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$");
- SmallVector<StringRef, 4> Ops;
+ SmallVector<StringRef, 5> Ops;
if (!GenericRegPattern.match(NameLower, &Ops)) {
Valid = false;
return -1;
}
- uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+ uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
uint32_t Bits;
- Ops[1].getAsInteger(10, Op1);
- Ops[2].getAsInteger(10, CRn);
- Ops[3].getAsInteger(10, CRm);
- Ops[4].getAsInteger(10, Op2);
+ Ops[1].getAsInteger(10, Op0);
+ Ops[2].getAsInteger(10, Op1);
+ Ops[3].getAsInteger(10, CRn);
+ Ops[4].getAsInteger(10, CRm);
+ Ops[5].getAsInteger(10, Op2);
Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
Valid = true;
@@ -814,11 +814,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
}
std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const {
// First search the registers shared by all
for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
if (SysRegPairs[i].Value == Bits) {
- Valid = true;
return SysRegPairs[i].Name;
}
}
@@ -827,7 +826,6 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
if (FeatureBits & AArch64::ProcCyclone) {
for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
if (CycloneSysRegPairs[i].Value == Bits) {
- Valid = true;
return CycloneSysRegPairs[i].Name;
}
}
@@ -837,28 +835,18 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
// write-only).
for (unsigned i = 0; i < NumInstPairs; ++i) {
if (InstPairs[i].Value == Bits) {
- Valid = true;
return InstPairs[i].Name;
}
}
+ assert(Bits < 0x10000);
uint32_t Op0 = (Bits >> 14) & 0x3;
uint32_t Op1 = (Bits >> 11) & 0x7;
uint32_t CRn = (Bits >> 7) & 0xf;
uint32_t CRm = (Bits >> 3) & 0xf;
uint32_t Op2 = Bits & 0x7;
- // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
- // name.
- if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
- Valid = false;
- return "";
- }
-
- assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
-
- Valid = true;
- return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+ return "s" + utostr(Op0) + "_" + utostr(Op1) + "_c" + utostr(CRn)
+ "_c" + utostr(CRm) + "_" + utostr(Op2);
}
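Note: the loosened pattern accepts any s<op0>_<op1>_c<n>_c<m>_<op2> name, packing the five fields as (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; the new toString() simply reverses that split, so every 16-bit encoding now has a printable generic name and the Valid out-parameter becomes unnecessary. A self-contained round-trip check (plain C++, helper names invented):

#include <cassert>
#include <cstdint>
#include <string>

static uint32_t encodeSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                             uint32_t CRm, uint32_t Op2) {
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

static std::string sysRegToString(uint32_t Bits) {
  assert(Bits < 0x10000);
  uint32_t Op0 = (Bits >> 14) & 0x3, Op1 = (Bits >> 11) & 0x7;
  uint32_t CRn = (Bits >> 7) & 0xf, CRm = (Bits >> 3) & 0xf;
  uint32_t Op2 = Bits & 0x7;
  return "s" + std::to_string(Op0) + "_" + std::to_string(Op1) + "_c" +
         std::to_string(CRn) + "_c" + std::to_string(CRm) + "_" +
         std::to_string(Op2);
}

int main() {
  // s3_0_c11_c5_1 survives a round trip through the 16-bit encoding.
  assert(sysRegToString(encodeSysReg(3, 0, 11, 5, 1)) == "s3_0_c11_c5_1");
  return 0;
}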
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9d2ce21..c60b09a 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -14,8 +14,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64BASEINFO_H
-#define AArch64BASEINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
// FIXME: Is it easiest to fix this layering violation by moving the .inc
// #includes from AArch64MCTargetDesc.h to here?
@@ -1143,7 +1143,7 @@ namespace AArch64SysReg {
SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
uint32_t fromString(StringRef Name, bool &Valid) const;
- std::string toString(uint32_t Bits, bool &Valid) const;
+ std::string toString(uint32_t Bits) const;
};
struct MSRMapper : SysRegMapper {
@@ -1271,7 +1271,12 @@ namespace AArch64II {
/// thread-local symbol. On Darwin, only one type of thread-local access
/// exists (pre linker-relaxation), but on ELF the TLSModel used for the
/// referee will affect interpretation.
- MO_TLS = 0x20
+ MO_TLS = 0x20,
+
+ /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
+ /// the address of a constant pool entry for the symbol, rather than the
+ /// address of the symbol itself.
+ MO_CONSTPOOL = 0x40
};
} // end namespace AArch64II
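Note: MO_CONSTPOOL joins the other MO_* operand flags as a fresh single-bit mask (0x40), OR-ed into and tested against an operand's target flags. A minimal sketch of that flag usage (the operand type is a stand-in, not LLVM's MachineOperand):

// Flag values copied from the AArch64II enum above.
enum : unsigned { MO_TLS = 0x20, MO_CONSTPOOL = 0x40 };

struct OperandSketch {
  unsigned TargetFlags = 0;
};

int main() {
  OperandSketch Op;
  Op.TargetFlags |= MO_CONSTPOOL; // refer via the constant-pool entry
  return (Op.TargetFlags & MO_CONSTPOOL) ? 0 : 1;
}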