diff options
Diffstat (limited to 'contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp | 1508 |
1 files changed, 1325 insertions, 183 deletions
diff --git a/contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp b/contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp index 34b955e..b0939c9 100644 --- a/contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp +++ b/contrib/llvm/tools/clang/utils/TableGen/NeonEmitter.cpp @@ -40,31 +40,58 @@ enum OpKind { OpUnavailable, OpAdd, OpAddl, + OpAddlHi, OpAddw, + OpAddwHi, OpSub, OpSubl, + OpSublHi, OpSubw, + OpSubwHi, OpMul, OpMla, OpMlal, + OpMullHi, + OpMullHiN, + OpMlalHi, + OpMlalHiN, OpMls, OpMlsl, + OpMlslHi, + OpMlslHiN, OpMulN, OpMlaN, OpMlsN, + OpFMlaN, + OpFMlsN, OpMlalN, OpMlslN, OpMulLane, + OpMulXLane, OpMullLane, + OpMullHiLane, OpMlaLane, OpMlsLane, OpMlalLane, + OpMlalHiLane, OpMlslLane, + OpMlslHiLane, OpQDMullLane, + OpQDMullHiLane, OpQDMlalLane, + OpQDMlalHiLane, OpQDMlslLane, + OpQDMlslHiLane, OpQDMulhLane, OpQRDMulhLane, + OpFMSLane, + OpFMSLaneQ, + OpTrn1, + OpZip1, + OpUzp1, + OpTrn2, + OpZip2, + OpUzp2, OpEq, OpGe, OpLe, @@ -87,10 +114,49 @@ enum OpKind { OpRev16, OpRev32, OpRev64, + OpXtnHi, + OpSqxtunHi, + OpQxtnHi, + OpFcvtnHi, + OpFcvtlHi, + OpFcvtxnHi, OpReinterpret, + OpAddhnHi, + OpRAddhnHi, + OpSubhnHi, + OpRSubhnHi, OpAbdl, + OpAbdlHi, OpAba, - OpAbal + OpAbal, + OpAbalHi, + OpQDMullHi, + OpQDMullHiN, + OpQDMlalHi, + OpQDMlalHiN, + OpQDMlslHi, + OpQDMlslHiN, + OpDiv, + OpLongHi, + OpNarrowHi, + OpMovlHi, + OpCopyLane, + OpCopyQLane, + OpCopyLaneQ, + OpScalarMulLane, + OpScalarMulLaneQ, + OpScalarMulXLane, + OpScalarMulXLaneQ, + OpScalarVMulXLane, + OpScalarVMulXLaneQ, + OpScalarQDMullLane, + OpScalarQDMullLaneQ, + OpScalarQDMulHiLane, + OpScalarQDMulHiLaneQ, + OpScalarQRDMulHiLane, + OpScalarQRDMulHiLaneQ, + OpScalarGetLane, + OpScalarSetLane }; enum ClassKind { @@ -126,8 +192,10 @@ public: Int64, Poly8, Poly16, + Poly64, Float16, - Float32 + Float32, + Float64 }; NeonTypeFlags(unsigned F) : Flags(F) {} @@ -154,31 +222,58 @@ public: OpMap["OP_UNAVAILABLE"] = OpUnavailable; OpMap["OP_ADD"] = OpAdd; OpMap["OP_ADDL"] = OpAddl; + 
OpMap["OP_ADDLHi"] = OpAddlHi; OpMap["OP_ADDW"] = OpAddw; + OpMap["OP_ADDWHi"] = OpAddwHi; OpMap["OP_SUB"] = OpSub; OpMap["OP_SUBL"] = OpSubl; + OpMap["OP_SUBLHi"] = OpSublHi; OpMap["OP_SUBW"] = OpSubw; + OpMap["OP_SUBWHi"] = OpSubwHi; OpMap["OP_MUL"] = OpMul; OpMap["OP_MLA"] = OpMla; OpMap["OP_MLAL"] = OpMlal; + OpMap["OP_MULLHi"] = OpMullHi; + OpMap["OP_MULLHi_N"] = OpMullHiN; + OpMap["OP_MLALHi"] = OpMlalHi; + OpMap["OP_MLALHi_N"] = OpMlalHiN; OpMap["OP_MLS"] = OpMls; OpMap["OP_MLSL"] = OpMlsl; + OpMap["OP_MLSLHi"] = OpMlslHi; + OpMap["OP_MLSLHi_N"] = OpMlslHiN; OpMap["OP_MUL_N"] = OpMulN; OpMap["OP_MLA_N"] = OpMlaN; OpMap["OP_MLS_N"] = OpMlsN; + OpMap["OP_FMLA_N"] = OpFMlaN; + OpMap["OP_FMLS_N"] = OpFMlsN; OpMap["OP_MLAL_N"] = OpMlalN; OpMap["OP_MLSL_N"] = OpMlslN; OpMap["OP_MUL_LN"]= OpMulLane; + OpMap["OP_MULX_LN"]= OpMulXLane; OpMap["OP_MULL_LN"] = OpMullLane; + OpMap["OP_MULLHi_LN"] = OpMullHiLane; OpMap["OP_MLA_LN"]= OpMlaLane; OpMap["OP_MLS_LN"]= OpMlsLane; OpMap["OP_MLAL_LN"] = OpMlalLane; + OpMap["OP_MLALHi_LN"] = OpMlalHiLane; OpMap["OP_MLSL_LN"] = OpMlslLane; + OpMap["OP_MLSLHi_LN"] = OpMlslHiLane; OpMap["OP_QDMULL_LN"] = OpQDMullLane; + OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane; OpMap["OP_QDMLAL_LN"] = OpQDMlalLane; + OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane; OpMap["OP_QDMLSL_LN"] = OpQDMlslLane; + OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane; OpMap["OP_QDMULH_LN"] = OpQDMulhLane; OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane; + OpMap["OP_FMS_LN"] = OpFMSLane; + OpMap["OP_FMS_LNQ"] = OpFMSLaneQ; + OpMap["OP_TRN1"] = OpTrn1; + OpMap["OP_ZIP1"] = OpZip1; + OpMap["OP_UZP1"] = OpUzp1; + OpMap["OP_TRN2"] = OpTrn2; + OpMap["OP_ZIP2"] = OpZip2; + OpMap["OP_UZP2"] = OpUzp2; OpMap["OP_EQ"] = OpEq; OpMap["OP_GE"] = OpGe; OpMap["OP_LE"] = OpLe; @@ -201,10 +296,49 @@ public: OpMap["OP_REV16"] = OpRev16; OpMap["OP_REV32"] = OpRev32; OpMap["OP_REV64"] = OpRev64; + OpMap["OP_XTN"] = OpXtnHi; + OpMap["OP_SQXTUN"] = OpSqxtunHi; + OpMap["OP_QXTN"] = OpQxtnHi; + 
OpMap["OP_VCVT_NA_HI"] = OpFcvtnHi; + OpMap["OP_VCVT_EX_HI"] = OpFcvtlHi; + OpMap["OP_VCVTX_HI"] = OpFcvtxnHi; OpMap["OP_REINT"] = OpReinterpret; + OpMap["OP_ADDHNHi"] = OpAddhnHi; + OpMap["OP_RADDHNHi"] = OpRAddhnHi; + OpMap["OP_SUBHNHi"] = OpSubhnHi; + OpMap["OP_RSUBHNHi"] = OpRSubhnHi; OpMap["OP_ABDL"] = OpAbdl; + OpMap["OP_ABDLHi"] = OpAbdlHi; OpMap["OP_ABA"] = OpAba; OpMap["OP_ABAL"] = OpAbal; + OpMap["OP_ABALHi"] = OpAbalHi; + OpMap["OP_QDMULLHi"] = OpQDMullHi; + OpMap["OP_QDMULLHi_N"] = OpQDMullHiN; + OpMap["OP_QDMLALHi"] = OpQDMlalHi; + OpMap["OP_QDMLALHi_N"] = OpQDMlalHiN; + OpMap["OP_QDMLSLHi"] = OpQDMlslHi; + OpMap["OP_QDMLSLHi_N"] = OpQDMlslHiN; + OpMap["OP_DIV"] = OpDiv; + OpMap["OP_LONG_HI"] = OpLongHi; + OpMap["OP_NARROW_HI"] = OpNarrowHi; + OpMap["OP_MOVL_HI"] = OpMovlHi; + OpMap["OP_COPY_LN"] = OpCopyLane; + OpMap["OP_COPYQ_LN"] = OpCopyQLane; + OpMap["OP_COPY_LNQ"] = OpCopyLaneQ; + OpMap["OP_SCALAR_MUL_LN"]= OpScalarMulLane; + OpMap["OP_SCALAR_MUL_LNQ"]= OpScalarMulLaneQ; + OpMap["OP_SCALAR_MULX_LN"]= OpScalarMulXLane; + OpMap["OP_SCALAR_MULX_LNQ"]= OpScalarMulXLaneQ; + OpMap["OP_SCALAR_VMULX_LN"]= OpScalarVMulXLane; + OpMap["OP_SCALAR_VMULX_LNQ"]= OpScalarVMulXLaneQ; + OpMap["OP_SCALAR_QDMULL_LN"] = OpScalarQDMullLane; + OpMap["OP_SCALAR_QDMULL_LNQ"] = OpScalarQDMullLaneQ; + OpMap["OP_SCALAR_QDMULH_LN"] = OpScalarQDMulHiLane; + OpMap["OP_SCALAR_QDMULH_LNQ"] = OpScalarQDMulHiLaneQ; + OpMap["OP_SCALAR_QRDMULH_LN"] = OpScalarQRDMulHiLane; + OpMap["OP_SCALAR_QRDMULH_LNQ"] = OpScalarQRDMulHiLaneQ; + OpMap["OP_SCALAR_GET_LN"] = OpScalarGetLane; + OpMap["OP_SCALAR_SET_LN"] = OpScalarSetLane; Record *SI = R.getClass("SInst"); Record *II = R.getClass("IInst"); @@ -235,7 +369,18 @@ public: void runTests(raw_ostream &o); private: - void emitIntrinsic(raw_ostream &OS, Record *R); + void emitIntrinsic(raw_ostream &OS, Record *R, + StringMap<ClassKind> &EmittedMap); + void genBuiltinsDef(raw_ostream &OS, StringMap<ClassKind> &A64IntrinsicMap, + bool 
isA64GenBuiltinDef); + void genOverloadTypeCheckCode(raw_ostream &OS, + StringMap<ClassKind> &A64IntrinsicMap, + bool isA64TypeCheck); + void genIntrinsicRangeCheckCode(raw_ostream &OS, + StringMap<ClassKind> &A64IntrinsicMap, + bool isA64RangeCheck); + void genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap, + bool isA64TestGen); }; } // end anonymous namespace @@ -249,7 +394,8 @@ static void ParseTypes(Record *r, std::string &s, int len = 0; for (unsigned i = 0, e = s.size(); i != e; ++i, ++len) { - if (data[len] == 'P' || data[len] == 'Q' || data[len] == 'U') + if (data[len] == 'P' || data[len] == 'Q' || data[len] == 'U' + || data[len] == 'H' || data[len] == 'S') continue; switch (data[len]) { @@ -259,6 +405,7 @@ static void ParseTypes(Record *r, std::string &s, case 'l': case 'h': case 'f': + case 'd': break; default: PrintFatalError(r->getLoc(), @@ -282,6 +429,8 @@ static char Widen(const char t) { return 'l'; case 'h': return 'f'; + case 'f': + return 'd'; default: PrintFatalError("unhandled type in widen!"); } @@ -299,18 +448,46 @@ static char Narrow(const char t) { return 'i'; case 'f': return 'h'; + case 'd': + return 'f'; default: PrintFatalError("unhandled type in narrow!"); } } +static std::string GetNarrowTypestr(StringRef ty) +{ + std::string s; + for (size_t i = 0, end = ty.size(); i < end; i++) { + switch (ty[i]) { + case 's': + s += 'c'; + break; + case 'i': + s += 's'; + break; + case 'l': + s += 'i'; + break; + default: + s += ty[i]; + break; + } + } + + return s; +} + /// For a particular StringRef, return the base type code, and whether it has /// the quad-vector, polynomial, or unsigned modifiers set. static char ClassifyType(StringRef ty, bool &quad, bool &poly, bool &usgn) { unsigned off = 0; - + // ignore scalar. + if (ty[off] == 'S') { + ++off; + } // remember quad. 
- if (ty[off] == 'Q') { + if (ty[off] == 'Q' || ty[off] == 'H') { quad = true; ++off; } @@ -342,27 +519,52 @@ static char ModType(const char mod, char type, bool &quad, bool &poly, usgn = true; } break; + case 'b': + scal = true; case 'u': usgn = true; poly = false; if (type == 'f') type = 'i'; + if (type == 'd') + type = 'l'; break; + case '$': + scal = true; case 'x': usgn = false; poly = false; if (type == 'f') type = 'i'; + if (type == 'd') + type = 'l'; break; + case 'o': + scal = true; + type = 'd'; + usgn = false; + break; + case 'y': + scal = true; case 'f': if (type == 'h') quad = true; type = 'f'; usgn = false; break; + case 'F': + type = 'd'; + usgn = false; + break; case 'g': quad = false; break; + case 'B': + case 'C': + case 'D': + case 'j': + quad = true; + break; case 'w': type = Widen(type); quad = true; @@ -379,6 +581,14 @@ static char ModType(const char mod, char type, bool &quad, bool &poly, scal = true; usgn = true; break; + case 'z': + type = Narrow(type); + scal = true; + break; + case 'r': + type = Widen(type); + scal = true; + break; case 's': case 'a': scal = true; @@ -397,16 +607,28 @@ static char ModType(const char mod, char type, bool &quad, bool &poly, if (type == 'h') quad = false; break; + case 'q': + type = Narrow(type); + quad = true; + break; case 'e': type = Narrow(type); usgn = true; break; + case 'm': + type = Narrow(type); + quad = false; + break; default: break; } return type; } +static bool IsMultiVecProto(const char p) { + return ((p >= '2' && p <= '4') || (p >= 'B' && p <= 'D')); +} + /// TypeString - for a modifier and type, generate the name of the typedef for /// that type. QUc -> uint8x8_t. static std::string TypeString(const char mod, StringRef typestr) { @@ -453,7 +675,7 @@ static std::string TypeString(const char mod, StringRef typestr) { s += quad ? "x4" : "x2"; break; case 'l': - s += "int64"; + s += (poly && !usgn)? "poly64" : "int64"; if (scal) break; s += quad ? 
"x2" : "x1"; @@ -470,15 +692,22 @@ static std::string TypeString(const char mod, StringRef typestr) { break; s += quad ? "x4" : "x2"; break; + case 'd': + s += "float64"; + if (scal) + break; + s += quad ? "x2" : "x1"; + break; + default: PrintFatalError("unhandled type!"); } - if (mod == '2') + if (mod == '2' || mod == 'B') s += "x2"; - if (mod == '3') + if (mod == '3' || mod == 'C') s += "x3"; - if (mod == '4') + if (mod == '4' || mod == 'D') s += "x4"; // Append _t, finishing the type string typedef type. @@ -527,7 +756,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, type = 's'; usgn = true; } - usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f'); + usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && + scal && type != 'f' && type != 'd'); if (scal) { SmallString<128> s; @@ -554,10 +784,12 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, // returning structs of 2, 3, or 4 vectors which are returned in a sret-like // fashion, storing them to a pointer arg. if (ret) { - if (mod >= '2' && mod <= '4') + if (IsMultiVecProto(mod)) return "vv*"; // void result with void* first argument if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (mod == 'F' || (ck != ClassB && type == 'd')) + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -569,15 +801,17 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, } // Non-return array types are passed as individual vectors. - if (mod == '2') + if (mod == '2' || mod == 'B') return quad ? "V16ScV16Sc" : "V8ScV8Sc"; - if (mod == '3') + if (mod == '3' || mod == 'C') return quad ? "V16ScV16ScV16Sc" : "V8ScV8ScV8Sc"; - if (mod == '4') + if (mod == '4' || mod == 'D') return quad ? "V16ScV16ScV16ScV16Sc" : "V8ScV8ScV8ScV8Sc"; if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? 
"V4f" : "V2f"; + if (mod == 'F' || (ck != ClassB && type == 'd')) + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -625,7 +859,7 @@ static void InstructionTypeCode(const StringRef &typeStr, break; case 'l': switch (ck) { - case ClassS: typeCode = usgn ? "u64" : "s64"; break; + case ClassS: typeCode = poly ? "p64" : usgn ? "u64" : "s64"; break; case ClassI: typeCode = "i64"; break; case ClassW: typeCode = "64"; break; default: break; @@ -647,17 +881,60 @@ static void InstructionTypeCode(const StringRef &typeStr, default: break; } break; + case 'd': + switch (ck) { + case ClassS: + case ClassI: + typeCode += "f64"; + break; + case ClassW: + PrintFatalError("unhandled type!"); + default: + break; + } + break; default: PrintFatalError("unhandled type!"); } } +static char Insert_BHSD_Suffix(StringRef typestr){ + unsigned off = 0; + if(typestr[off++] == 'S'){ + while(typestr[off] == 'Q' || typestr[off] == 'H'|| + typestr[off] == 'P' || typestr[off] == 'U') + ++off; + switch (typestr[off]){ + default : break; + case 'c' : return 'b'; + case 's' : return 'h'; + case 'i' : + case 'f' : return 's'; + case 'l' : + case 'd' : return 'd'; + } + } + return 0; +} + +static bool endsWith_xN(std::string const &name) { + if (name.length() > 3) { + if (name.compare(name.length() - 3, 3, "_x2") == 0 || + name.compare(name.length() - 3, 3, "_x3") == 0 || + name.compare(name.length() - 3, 3, "_x4") == 0) + return true; + } + return false; +} + /// MangleName - Append a type or width suffix to a base neon function name, -/// and insert a 'q' in the appropriate location if the operation works on -/// 128b rather than 64b. E.g. turn "vst2_lane" into "vst2q_lane_f32", etc. +/// and insert a 'q' in the appropriate location if type string starts with 'Q'. +/// E.g. turn "vst2_lane" into "vst2q_lane_f32", etc. +/// Insert proper 'b' 'h' 's' 'd' if prefix 'S' is used. 
static std::string MangleName(const std::string &name, StringRef typestr, ClassKind ck) { - if (name == "vcvt_f32_f16") + if (name == "vcvt_f32_f16" || name == "vcvt_f32_f64" || + name == "vcvt_f64_f32") return name; bool quad = false; @@ -668,7 +945,11 @@ static std::string MangleName(const std::string &name, StringRef typestr, std::string s = name; if (typeCode.size() > 0) { - s += "_" + typeCode; + // If the name is end with _xN (N = 2,3,4), insert the typeCode before _xN. + if (endsWith_xN(s)) + s.insert(s.length() - 3, "_" + typeCode); + else + s += "_" + typeCode; } if (ck == ClassB) @@ -676,9 +957,14 @@ static std::string MangleName(const std::string &name, StringRef typestr, // Insert a 'q' before the first '_' character so that it ends up before // _lane or _n on vector-scalar operations. - if (quad) { + if (typestr.find("Q") != StringRef::npos) { + size_t pos = s.find('_'); + s = s.insert(pos, "q"); + } + char ins = Insert_BHSD_Suffix(typestr); + if(ins){ size_t pos = s.find('_'); - s = s.insert(pos, "q"); + s = s.insert(pos, &ins, 1); } return s; @@ -770,9 +1056,7 @@ GenerateRegisterCheckPatternForLoadStores(const StringRef &NameRef, // a dup/lane instruction. 
if (IsLDSTOne) { if ((HasLanePostfix || HasDupPostfix) && OutTypeCode != "8") { - RegisterSuffix += ", :" + OutTypeCode; - } else if (OutTypeCode == "64") { - RegisterSuffix += ", :64"; + RegisterSuffix += ":" + OutTypeCode; } } @@ -828,6 +1112,7 @@ static void NormalizeProtoForRegisterPatternCreation(const std::string &Name, switch (Proto[i]) { case 'u': case 'f': + case 'F': case 'd': case 's': case 'x': @@ -840,6 +1125,7 @@ static void NormalizeProtoForRegisterPatternCreation(const std::string &Name, NormedProto += 'q'; break; case 'g': + case 'j': case 'h': case 'e': NormedProto += 'd'; @@ -1158,7 +1444,8 @@ static bool MacroArgUsedDirectly(const std::string &proto, unsigned i) { } // Generate the string "(argtype a, argtype b, ...)" -static std::string GenArgs(const std::string &proto, StringRef typestr) { +static std::string GenArgs(const std::string &proto, StringRef typestr, + const std::string &name) { bool define = UseMacro(proto); char arg = 'a'; @@ -1176,6 +1463,9 @@ static std::string GenArgs(const std::string &proto, StringRef typestr) { s += TypeString(proto[i], typestr) + " __"; } s.push_back(arg); + //To avoid argument being multiple defined, add extra number for renaming. + if (name == "vcopy_lane" || name == "vcopy_laneq") + s.push_back('1'); if ((i + 1) < e) s += ", "; } @@ -1186,7 +1476,8 @@ static std::string GenArgs(const std::string &proto, StringRef typestr) { // Macro arguments are not type-checked like inline function arguments, so // assign them to local temporaries to get the right type checking. 
-static std::string GenMacroLocals(const std::string &proto, StringRef typestr) { +static std::string GenMacroLocals(const std::string &proto, StringRef typestr, + const std::string &name ) { char arg = 'a'; std::string s; bool generatedLocal = false; @@ -1197,11 +1488,18 @@ static std::string GenMacroLocals(const std::string &proto, StringRef typestr) { if (MacroArgUsedDirectly(proto, i)) continue; generatedLocal = true; + bool extranumber = false; + if (name == "vcopy_lane" || name == "vcopy_laneq") + extranumber = true; s += TypeString(proto[i], typestr) + " __"; s.push_back(arg); + if(extranumber) + s.push_back('1'); s += " = ("; s.push_back(arg); + if(extranumber) + s.push_back('1'); s += "); "; } @@ -1211,13 +1509,60 @@ static std::string GenMacroLocals(const std::string &proto, StringRef typestr) { } // Use the vmovl builtin to sign-extend or zero-extend a vector. -static std::string Extend(StringRef typestr, const std::string &a) { +static std::string Extend(StringRef typestr, const std::string &a, bool h=0) { + std::string s, high; + high = h ? "_high" : ""; + s = MangleName("vmovl" + high, typestr, ClassS); + s += "(" + a + ")"; + return s; +} + +// Get the high 64-bit part of a vector +static std::string GetHigh(const std::string &a, StringRef typestr) { std::string s; - s = MangleName("vmovl", typestr, ClassS); + s = MangleName("vget_high", typestr, ClassS); s += "(" + a + ")"; return s; } +// Gen operation with two operands and get high 64-bit for both of two operands. +static std::string Gen2OpWith2High(StringRef typestr, + const std::string &op, + const std::string &a, + const std::string &b) { + std::string s; + std::string Op1 = GetHigh(a, typestr); + std::string Op2 = GetHigh(b, typestr); + s = MangleName(op, typestr, ClassS); + s += "(" + Op1 + ", " + Op2 + ");"; + return s; +} + +// Gen operation with three operands and get high 64-bit of the latter +// two operands. 
+static std::string Gen3OpWith2High(StringRef typestr, + const std::string &op, + const std::string &a, + const std::string &b, + const std::string &c) { + std::string s; + std::string Op1 = GetHigh(b, typestr); + std::string Op2 = GetHigh(c, typestr); + s = MangleName(op, typestr, ClassS); + s += "(" + a + ", " + Op1 + ", " + Op2 + ");"; + return s; +} + +// Gen combine operation by putting a on low 64-bit, and b on high 64-bit. +static std::string GenCombine(std::string typestr, + const std::string &a, + const std::string &b) { + std::string s; + s = MangleName("vcombine", typestr, ClassS); + s += "(" + a + ", " + b + ")"; + return s; +} + static std::string Duplicate(unsigned nElts, StringRef typestr, const std::string &a) { std::string s; @@ -1242,6 +1587,15 @@ static std::string SplatLane(unsigned nElts, const std::string &vec, return s; } +static std::string RemoveHigh(const std::string &name) { + std::string s = name; + std::size_t found = s.find("_high_"); + if (found == std::string::npos) + PrintFatalError("name should contain \"_high_\" for high intrinsics"); + s.replace(found, 5, ""); + return s; +} + static unsigned GetNumElements(StringRef typestr, bool &quad) { quad = false; bool dummy = false; @@ -1254,6 +1608,9 @@ static unsigned GetNumElements(StringRef typestr, bool &quad) { case 'l': nElts = 1; break; case 'h': nElts = 4; break; case 'f': nElts = 2; break; + case 'd': + nElts = 1; + break; default: PrintFatalError("unhandled type!"); } @@ -1262,8 +1619,8 @@ static unsigned GetNumElements(StringRef typestr, bool &quad) { } // Generate the definition for this intrinsic, e.g. "a + b" for OpAdd. 
-static std::string GenOpString(OpKind op, const std::string &proto, - StringRef typestr) { +static std::string GenOpString(const std::string &name, OpKind op, + const std::string &proto, StringRef typestr) { bool quad; unsigned nElts = GetNumElements(typestr, quad); bool define = UseMacro(proto); @@ -1281,31 +1638,59 @@ static std::string GenOpString(OpKind op, const std::string &proto, case OpAddl: s += Extend(typestr, "__a") + " + " + Extend(typestr, "__b") + ";"; break; + case OpAddlHi: + s += Extend(typestr, "__a", 1) + " + " + Extend(typestr, "__b", 1) + ";"; + break; case OpAddw: s += "__a + " + Extend(typestr, "__b") + ";"; break; + case OpAddwHi: + s += "__a + " + Extend(typestr, "__b", 1) + ";"; + break; case OpSub: s += "__a - __b;"; break; case OpSubl: s += Extend(typestr, "__a") + " - " + Extend(typestr, "__b") + ";"; break; + case OpSublHi: + s += Extend(typestr, "__a", 1) + " - " + Extend(typestr, "__b", 1) + ";"; + break; case OpSubw: s += "__a - " + Extend(typestr, "__b") + ";"; break; + case OpSubwHi: + s += "__a - " + Extend(typestr, "__b", 1) + ";"; + break; case OpMulN: s += "__a * " + Duplicate(nElts, typestr, "__b") + ";"; break; case OpMulLane: s += "__a * " + SplatLane(nElts, "__b", "__c") + ";"; break; + case OpMulXLane: + s += MangleName("vmulx", typestr, ClassS) + "(__a, " + + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMul: s += "__a * __b;"; break; + case OpFMlaN: + s += MangleName("vfma", typestr, ClassS); + s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");"; + break; + case OpFMlsN: + s += MangleName("vfms", typestr, ClassS); + s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");"; + break; case OpMullLane: s += MangleName("vmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpMullHiLane: + s += MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMlaN: s += "__a + (__b * " + 
Duplicate(nElts, typestr, "__c") + ");"; break; @@ -1323,15 +1708,45 @@ static std::string GenOpString(OpKind op, const std::string &proto, s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlalHiLane: + s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlal: s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; + case OpMullHi: + s += Gen2OpWith2High(typestr, "vmull", "__a", "__b"); + break; + case OpMullHiN: + s += MangleName("vmull_n", typestr, ClassS); + s += "(" + GetHigh("__a", typestr) + ", __b);"; + return s; + case OpMlalHi: + s += Gen3OpWith2High(typestr, "vmlal", "__a", "__b", "__c"); + break; + case OpMlalHiN: + s += MangleName("vmlal_n", typestr, ClassS); + s += "(__a, " + GetHigh("__b", typestr) + ", __c);"; + return s; case OpMlsN: s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlsLane: s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpFMSLane: + s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n "; + s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n "; + s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n "; + s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);"; + break; + case OpFMSLaneQ: + s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n "; + s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n "; + s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n "; + s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);"; + break; case OpMls: s += "__a - (__b * __c);"; break; @@ -1343,21 +1758,44 @@ static std::string GenOpString(OpKind op, const std::string &proto, s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlslHiLane: + s += "__a - " + MangleName("vmull", 
typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlsl: s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; + case OpMlslHi: + s += Gen3OpWith2High(typestr, "vmlsl", "__a", "__b", "__c"); + break; + case OpMlslHiN: + s += MangleName("vmlsl_n", typestr, ClassS); + s += "(__a, " + GetHigh("__b", typestr) + ", __c);"; + break; case OpQDMullLane: s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpQDMullHiLane: + s += MangleName("vqdmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpQDMlalLane: s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlalHiLane: + s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMlslLane: s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlslHiLane: + s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMulhLane: s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; @@ -1410,12 +1848,17 @@ static std::string GenOpString(OpKind op, const std::string &proto, s += ", (int64x1_t)__b, 0, 1);"; break; case OpHi: - s += "(" + ts + - ")__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1);"; + // nElts is for the result vector, so the source is twice that number. 
+ s += "__builtin_shufflevector(__a, __a"; + for (unsigned i = nElts; i < nElts * 2; ++i) + s += ", " + utostr(i); + s+= ");"; break; case OpLo: - s += "(" + ts + - ")__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0);"; + s += "__builtin_shufflevector(__a, __a"; + for (unsigned i = 0; i < nElts; ++i) + s += ", " + utostr(i); + s+= ");"; break; case OpDup: s += Duplicate(nElts, typestr, "__a") + ";"; @@ -1455,6 +1898,94 @@ static std::string GenOpString(OpKind op, const std::string &proto, s += ");"; break; } + case OpXtnHi: { + s = TypeString(proto[1], typestr) + " __a1 = " + + MangleName("vmovn", typestr, ClassS) + "(__b);\n " + + "return __builtin_shufflevector(__a, __a1"; + for (unsigned i = 0; i < nElts * 4; ++i) + s += ", " + utostr(i); + s += ");"; + break; + } + case OpSqxtunHi: { + s = TypeString(proto[1], typestr) + " __a1 = " + + MangleName("vqmovun", typestr, ClassS) + "(__b);\n " + + "return __builtin_shufflevector(__a, __a1"; + for (unsigned i = 0; i < nElts * 4; ++i) + s += ", " + utostr(i); + s += ");"; + break; + } + case OpQxtnHi: { + s = TypeString(proto[1], typestr) + " __a1 = " + + MangleName("vqmovn", typestr, ClassS) + "(__b);\n " + + "return __builtin_shufflevector(__a, __a1"; + for (unsigned i = 0; i < nElts * 4; ++i) + s += ", " + utostr(i); + s += ");"; + break; + } + case OpFcvtnHi: { + std::string FName = (nElts == 1) ? "vcvt_f32" : "vcvt_f16"; + s = TypeString(proto[1], typestr) + " __a1 = " + + MangleName(FName, typestr, ClassS) + "(__b);\n " + + "return __builtin_shufflevector(__a, __a1"; + for (unsigned i = 0; i < nElts * 4; ++i) + s += ", " + utostr(i); + s += ");"; + break; + } + case OpFcvtlHi: { + std::string FName = (nElts == 2) ? 
"vcvt_f64" : "vcvt_f32"; + s = TypeString('d', typestr) + " __a1 = " + GetHigh("__a", typestr) + + ";\n return " + MangleName(FName, typestr, ClassS) + "(__a1);"; + break; + } + case OpFcvtxnHi: { + s = TypeString(proto[1], typestr) + " __a1 = " + + MangleName("vcvtx_f32", typestr, ClassS) + "(__b);\n " + + "return __builtin_shufflevector(__a, __a1"; + for (unsigned i = 0; i < nElts * 4; ++i) + s += ", " + utostr(i); + s += ");"; + break; + } + case OpUzp1: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = 0; i < nElts; i++) + s += ", " + utostr(2*i); + s += ");"; + break; + case OpUzp2: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = 0; i < nElts; i++) + s += ", " + utostr(2*i+1); + s += ");"; + break; + case OpZip1: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = 0; i < (nElts/2); i++) + s += ", " + utostr(i) + ", " + utostr(i+nElts); + s += ");"; + break; + case OpZip2: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = nElts/2; i < nElts; i++) + s += ", " + utostr(i) + ", " + utostr(i+nElts); + s += ");"; + break; + case OpTrn1: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = 0; i < (nElts/2); i++) + s += ", " + utostr(2*i) + ", " + utostr(2*i+nElts); + s += ");"; + break; + case OpTrn2: + s += "__builtin_shufflevector(__a, __b"; + for (unsigned i = 0; i < (nElts/2); i++) + s += ", " + utostr(2*i+1) + ", " + utostr(2*i+1+nElts); + s += ");"; + break; case OpAbdl: { std::string abd = MangleName("vabd", typestr, ClassS) + "(__a, __b)"; if (typestr[0] != 'U') { @@ -1468,23 +1999,247 @@ static std::string GenOpString(OpKind op, const std::string &proto, } break; } + case OpAbdlHi: + s += Gen2OpWith2High(typestr, "vabdl", "__a", "__b"); + break; + case OpAddhnHi: { + std::string addhn = MangleName("vaddhn", typestr, ClassS) + "(__b, __c)"; + s += GenCombine(GetNarrowTypestr(typestr), "__a", addhn); + s += ";"; + break; + } + case OpRAddhnHi: { + std::string raddhn = MangleName("vraddhn", typestr, 
ClassS) + "(__b, __c)"; + s += GenCombine(GetNarrowTypestr(typestr), "__a", raddhn); + s += ";"; + break; + } + case OpSubhnHi: { + std::string subhn = MangleName("vsubhn", typestr, ClassS) + "(__b, __c)"; + s += GenCombine(GetNarrowTypestr(typestr), "__a", subhn); + s += ";"; + break; + } + case OpRSubhnHi: { + std::string rsubhn = MangleName("vrsubhn", typestr, ClassS) + "(__b, __c)"; + s += GenCombine(GetNarrowTypestr(typestr), "__a", rsubhn); + s += ";"; + break; + } case OpAba: s += "__a + " + MangleName("vabd", typestr, ClassS) + "(__b, __c);"; break; - case OpAbal: { - s += "__a + "; - std::string abd = MangleName("vabd", typestr, ClassS) + "(__b, __c)"; - if (typestr[0] != 'U') { - // vabd results are always unsigned and must be zero-extended. - std::string utype = "U" + typestr.str(); - s += "(" + TypeString(proto[0], typestr) + ")"; - abd = "(" + TypeString('d', utype) + ")" + abd; - s += Extend(utype, abd) + ";"; + case OpAbal: + s += "__a + " + MangleName("vabdl", typestr, ClassS) + "(__b, __c);"; + break; + case OpAbalHi: + s += Gen3OpWith2High(typestr, "vabal", "__a", "__b", "__c"); + break; + case OpQDMullHi: + s += Gen2OpWith2High(typestr, "vqdmull", "__a", "__b"); + break; + case OpQDMullHiN: + s += MangleName("vqdmull_n", typestr, ClassS); + s += "(" + GetHigh("__a", typestr) + ", __b);"; + return s; + case OpQDMlalHi: + s += Gen3OpWith2High(typestr, "vqdmlal", "__a", "__b", "__c"); + break; + case OpQDMlalHiN: + s += MangleName("vqdmlal_n", typestr, ClassS); + s += "(__a, " + GetHigh("__b", typestr) + ", __c);"; + return s; + case OpQDMlslHi: + s += Gen3OpWith2High(typestr, "vqdmlsl", "__a", "__b", "__c"); + break; + case OpQDMlslHiN: + s += MangleName("vqdmlsl_n", typestr, ClassS); + s += "(__a, " + GetHigh("__b", typestr) + ", __c);"; + return s; + case OpDiv: + s += "__a / __b;"; + break; + case OpMovlHi: { + s = TypeString(proto[1], typestr.drop_front()) + " __a1 = " + + MangleName("vget_high", typestr, ClassS) + "(__a);\n " + s; + s += "(" + 
ts + ")" + MangleName("vshll_n", typestr, ClassS); + s += "(__a1, 0);"; + break; + } + case OpLongHi: { + // Another local variable __a1 is needed for calling a Macro, + // or using __a will have naming conflict when Macro expanding. + s += TypeString(proto[1], typestr.drop_front()) + " __a1 = " + + MangleName("vget_high", typestr, ClassS) + "(__a); \\\n"; + s += " (" + ts + ")" + MangleName(RemoveHigh(name), typestr, ClassS) + + "(__a1, __b);"; + break; + } + case OpNarrowHi: { + s += "(" + ts + ")" + MangleName("vcombine", typestr, ClassS) + "(__a, " + + MangleName(RemoveHigh(name), typestr, ClassS) + "(__b, __c));"; + break; + } + case OpCopyLane: { + s += TypeString('s', typestr) + " __c2 = " + + MangleName("vget_lane", typestr, ClassS) + "(__c1, __d1); \\\n " + + MangleName("vset_lane", typestr, ClassS) + "(__c2, __a1, __b1);"; + break; + } + case OpCopyQLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __c2 = vget_lane_" + typeCode + + "(__c1, __d1); \\\n vsetq_lane_" + typeCode + "(__c2, __a1, __b1);"; + break; + } + case OpCopyLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __c2 = vgetq_lane_" + typeCode + + "(__c1, __d1); \\\n vset_lane_" + typeCode + "(__c2, __a1, __b1);"; + break; + } + case OpScalarMulLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode + + "(__b, __c);\\\n __a * __d1;"; + break; + } + case OpScalarMulLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + typeCode + + "(__b, __c);\\\n __a * __d1;"; + break; + } + case OpScalarMulXLane: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + 
InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode + + "(__b, __c);\\\n vmulx" + type + "_" + + typeCode + "(__a, __d1);"; + break; + } + case OpScalarMulXLaneQ: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + + typeCode + "(__b, __c);\\\n vmulx" + type + + "_" + typeCode + "(__a, __d1);"; + break; + } + + case OpScalarVMulXLane: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + + typeCode + "(__a, 0);\\\n" + + " " + TypeString('s', typestr) + " __e1 = vget_lane_" + + typeCode + "(__b, __c);\\\n" + + " " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" + + typeCode + "(__d1, __e1);\\\n" + + " " + TypeString('d', typestr) + " __g1;\\\n" + + " vset_lane_" + typeCode + "(__f1, __g1, __c);"; + break; + } + + case OpScalarVMulXLaneQ: { + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + if (type == 'f') type = 's'; + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += TypeString('s', typestr) + " __d1 = vget_lane_" + + typeCode + "(__a, 0);\\\n" + + " " + TypeString('s', typestr) + " __e1 = vgetq_lane_" + + typeCode + "(__b, __c);\\\n" + + " " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" + + typeCode + "(__d1, __e1);\\\n" + + " " + TypeString('d', typestr) + " __g1;\\\n" + + " vset_lane_" + typeCode + "(__f1, __g1, 0);"; + break; + } + case OpScalarQDMullLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + + 
"vget_lane_" + typeCode + "(b, __c));"; + break; + } + case OpScalarQDMullLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + + "vgetq_lane_" + typeCode + "(b, __c));"; + break; + } + case OpScalarQDMulHiLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + + "vget_lane_" + typeCode + "(__b, __c));"; + break; + } + case OpScalarQDMulHiLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + + "vgetq_lane_" + typeCode + "(__b, __c));"; + break; + } + case OpScalarQRDMulHiLane: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " + + "vget_lane_" + typeCode + "(__b, __c));"; + break; + } + case OpScalarQRDMulHiLaneQ: { + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " + + "vgetq_lane_" + typeCode + "(__b, __c));"; + break; + } + case OpScalarGetLane:{ + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + if (quad) { + s += "int16x8_t __a1 = vreinterpretq_s16_f16(__a);\\\n"; + s += " vgetq_lane_s16(__a1, __b);"; } else { - s += Extend(typestr, abd) + ";"; + s += "int16x4_t __a1 = vreinterpret_s16_f16(__a);\\\n"; + s += " vget_lane_s16(__a1, __b);"; } break; } + case OpScalarSetLane:{ + std::string typeCode = ""; + InstructionTypeCode(typestr, ClassS, quad, typeCode); + s += "int16_t __a1 = (int16_t)__a;\\\n"; + if (quad) { + s += " int16x8_t __b1 = vreinterpretq_s16_f16(b);\\\n"; + s += " int16x8_t __b2 = vsetq_lane_s16(__a1, __b1, __c);\\\n"; + s += " vreinterpretq_f16_s16(__b2);"; + } else { + s += " int16x4_t __b1 = vreinterpret_s16_f16(b);\\\n"; + s += " 
int16x4_t __b2 = vset_lane_s16(__a1, __b1, __c);\\\n"; + s += " vreinterpret_f16_s16(__b2);"; + } + break; + } + default: PrintFatalError("unknown OpKind!"); } @@ -1494,7 +2249,7 @@ static std::string GenOpString(OpKind op, const std::string &proto, static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) { unsigned mod = proto[0]; - if (mod == 'v' || mod == 'f') + if (mod == 'v' || mod == 'f' || mod == 'F') mod = proto[1]; bool quad = false; @@ -1522,7 +2277,7 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) { ET = NeonTypeFlags::Int32; break; case 'l': - ET = NeonTypeFlags::Int64; + ET = poly ? NeonTypeFlags::Poly64 : NeonTypeFlags::Int64; break; case 'h': ET = NeonTypeFlags::Float16; @@ -1530,6 +2285,9 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) { case 'f': ET = NeonTypeFlags::Float32; break; + case 'd': + ET = NeonTypeFlags::Float64; + break; default: PrintFatalError("unhandled type!"); } @@ -1537,6 +2295,19 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) { return Flags.getFlags(); } +// We don't check 'a' in this function, because for builtin function the +// argument matching to 'a' uses a vector type splatted from a scalar type. +static bool ProtoHasScalar(const std::string proto) +{ + return (proto.find('s') != std::string::npos + || proto.find('z') != std::string::npos + || proto.find('r') != std::string::npos + || proto.find('b') != std::string::npos + || proto.find('$') != std::string::npos + || proto.find('y') != std::string::npos + || proto.find('o') != std::string::npos); +} + // Generate the definition for this intrinsic, e.g. 
__builtin_neon_cls(a) static std::string GenBuiltin(const std::string &name, const std::string &proto, StringRef typestr, ClassKind ck) { @@ -1544,14 +2315,14 @@ static std::string GenBuiltin(const std::string &name, const std::string &proto, // If this builtin returns a struct 2, 3, or 4 vectors, pass it as an implicit // sret-like argument. - bool sret = (proto[0] >= '2' && proto[0] <= '4'); + bool sret = IsMultiVecProto(proto[0]); bool define = UseMacro(proto); // Check if the prototype has a scalar operand with the type of the vector // elements. If not, bitcasting the args will take care of arg checking. // The actual signedness etc. will be taken care of with special enums. - if (proto.find('s') == std::string::npos) + if (!ProtoHasScalar(proto)) ck = ClassB; if (proto[0] != 'v') { @@ -1604,12 +2375,19 @@ static std::string GenBuiltin(const std::string &name, const std::string &proto, // Handle multiple-vector values specially, emitting each subvector as an // argument to the __builtin. + unsigned NumOfVec = 0; if (proto[i] >= '2' && proto[i] <= '4') { + NumOfVec = proto[i] - '0'; + } else if (proto[i] >= 'B' && proto[i] <= 'D') { + NumOfVec = proto[i] - 'A' + 1; + } + + if (NumOfVec > 0) { // Check if an explicit cast is needed. if (argType != 'c' || argPoly || argUsgn) args = (argQuad ? "(int8x16_t)" : "(int8x8_t)") + args; - for (unsigned vi = 0, ve = proto[i] - '0'; vi != ve; ++vi) { + for (unsigned vi = 0, ve = NumOfVec; vi != ve; ++vi) { s += args + ".val[" + utostr(vi) + "]"; if ((vi + 1) < ve) s += ", "; @@ -1662,7 +2440,7 @@ static std::string GenBuiltinDef(const std::string &name, // If all types are the same size, bitcasting the args will take care // of arg checking. The actual signedness etc. will be taken care of with // special enums. 
- if (proto.find('s') == std::string::npos) + if (!ProtoHasScalar(proto)) ck = ClassB; s += MangleName(name, typestr, ck); @@ -1706,12 +2484,12 @@ static std::string GenIntrinsic(const std::string &name, s += mangledName; // Function arguments - s += GenArgs(proto, inTypeStr); + s += GenArgs(proto, inTypeStr, name); // Definition. if (define) { s += " __extension__ ({ \\\n "; - s += GenMacroLocals(proto, inTypeStr); + s += GenMacroLocals(proto, inTypeStr, name); } else if (kind == OpUnavailable) { s += " __attribute__((unavailable));\n"; return s; @@ -1719,7 +2497,7 @@ static std::string GenIntrinsic(const std::string &name, s += " {\n "; if (kind != OpNone) - s += GenOpString(kind, proto, outTypeStr); + s += GenOpString(name, kind, proto, outTypeStr); else s += GenBuiltin(name, proto, outTypeStr, classKind); if (define) @@ -1773,7 +2551,7 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "#ifndef __ARM_NEON_H\n"; OS << "#define __ARM_NEON_H\n\n"; - OS << "#ifndef __ARM_NEON__\n"; + OS << "#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)\n"; OS << "#error \"NEON support not enabled\"\n"; OS << "#endif\n\n"; @@ -1781,19 +2559,50 @@ void NeonEmitter::run(raw_ostream &OS) { // Emit NEON-specific scalar typedefs. OS << "typedef float float32_t;\n"; + OS << "typedef __fp16 float16_t;\n"; + + OS << "#ifdef __aarch64__\n"; + OS << "typedef double float64_t;\n"; + OS << "#endif\n\n"; + + // For now, signedness of polynomial types depends on target + OS << "#ifdef __aarch64__\n"; + OS << "typedef uint8_t poly8_t;\n"; + OS << "typedef uint16_t poly16_t;\n"; + OS << "typedef uint64_t poly64_t;\n"; + OS << "#else\n"; OS << "typedef int8_t poly8_t;\n"; OS << "typedef int16_t poly16_t;\n"; - OS << "typedef uint16_t float16_t;\n"; + OS << "#endif\n"; // Emit Neon vector typedefs. 
- std::string TypedefTypes("cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfPcQPcPsQPs"); + std::string TypedefTypes( + "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPsPlQPl"); SmallVector<StringRef, 24> TDTypeVec; ParseTypes(0, TypedefTypes, TDTypeVec); // Emit vector typedefs. + bool isA64 = false; + bool preinsert; + bool postinsert; for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; - (void) ClassifyType(TDTypeVec[i], quad, poly, dummy); + char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); + preinsert = false; + postinsert = false; + + if (type == 'd' || (type == 'l' && poly)) { + preinsert = isA64? false: true; + isA64 = true; + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) + OS << "#ifdef __aarch64__\n"; + if (poly) OS << "typedef __attribute__((neon_polyvector_type("; else @@ -1806,50 +2615,130 @@ void NeonEmitter::run(raw_ostream &OS) { OS << TypeString('s', TDTypeVec[i]); OS << " " << TypeString('d', TDTypeVec[i]) << ";\n"; + } + postinsert = isA64? true: false; + if (postinsert) + OS << "#endif\n"; OS << "\n"; // Emit struct typedefs. + isA64 = false; for (unsigned vi = 2; vi != 5; ++vi) { for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { + bool dummy, quad = false, poly = false; + char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); + preinsert = false; + postinsert = false; + + if (type == 'd' || (type == 'l' && poly)) { + preinsert = isA64? false: true; + isA64 = true; + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) + OS << "#ifdef __aarch64__\n"; + std::string ts = TypeString('d', TDTypeVec[i]); std::string vs = TypeString('0' + vi, TDTypeVec[i]); OS << "typedef struct " << vs << " {\n"; OS << " " << ts << " val"; OS << "[" << utostr(vi) << "]"; OS << ";\n} "; - OS << vs << ";\n\n"; + OS << vs << ";\n"; + OS << "\n"; } } + postinsert = isA64? 
true: false; + if (postinsert) + OS << "#endif\n"; + OS << "\n"; OS<<"#define __ai static inline __attribute__((__always_inline__, __nodebug__))\n\n"; std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst"); + StringMap<ClassKind> EmittedMap; + // Emit vmovl, vmull and vabd intrinsics first so they can be used by other // intrinsics. (Some of the saturating multiply instructions are also // used to implement the corresponding "_lane" variants, but tablegen // sorts the records into alphabetical order so that the "_lane" variants // come after the intrinsics they use.) - emitIntrinsic(OS, Records.getDef("VMOVL")); - emitIntrinsic(OS, Records.getDef("VMULL")); - emitIntrinsic(OS, Records.getDef("VABD")); - + emitIntrinsic(OS, Records.getDef("VMOVL"), EmittedMap); + emitIntrinsic(OS, Records.getDef("VMULL"), EmittedMap); + emitIntrinsic(OS, Records.getDef("VABD"), EmittedMap); + emitIntrinsic(OS, Records.getDef("VABDL"), EmittedMap); + + // ARM intrinsics must be emitted before AArch64 intrinsics to ensure + // common intrinsics appear only once in the output stream. + // The check for uniquiness is done in emitIntrinsic. + // Emit ARM intrinsics. for (unsigned i = 0, e = RV.size(); i != e; ++i) { Record *R = RV[i]; - if (R->getName() != "VMOVL" && - R->getName() != "VMULL" && + + // Skip AArch64 intrinsics; they will be emitted at the end. + bool isA64 = R->getValueAsBit("isA64"); + if (isA64) + continue; + + if (R->getName() != "VMOVL" && R->getName() != "VMULL" && R->getName() != "VABD") - emitIntrinsic(OS, R); + emitIntrinsic(OS, R, EmittedMap); + } + + // Emit AArch64-specific intrinsics. + OS << "#ifdef __aarch64__\n"; + + emitIntrinsic(OS, Records.getDef("VMOVL_HIGH"), EmittedMap); + emitIntrinsic(OS, Records.getDef("VMULL_HIGH"), EmittedMap); + emitIntrinsic(OS, Records.getDef("VABDL_HIGH"), EmittedMap); + + for (unsigned i = 0, e = RV.size(); i != e; ++i) { + Record *R = RV[i]; + + // Skip ARM intrinsics already included above. 
+ bool isA64 = R->getValueAsBit("isA64"); + if (!isA64) + continue; + + // Skip crypto temporarily, and will emit them all together at the end. + bool isCrypto = R->getValueAsBit("isCrypto"); + if (isCrypto) + continue; + + emitIntrinsic(OS, R, EmittedMap); + } + + OS << "#ifdef __ARM_FEATURE_CRYPTO\n"; + + for (unsigned i = 0, e = RV.size(); i != e; ++i) { + Record *R = RV[i]; + + // Skip crypto temporarily, and will emit them all together at the end. + bool isCrypto = R->getValueAsBit("isCrypto"); + if (!isCrypto) + continue; + + emitIntrinsic(OS, R, EmittedMap); } + + OS << "#endif\n\n"; + + OS << "#endif\n\n"; OS << "#undef __ai\n\n"; OS << "#endif /* __ARM_NEON_H */\n"; } /// emitIntrinsic - Write out the arm_neon.h header file definitions for the -/// intrinsics specified by record R. -void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R) { +/// intrinsics specified by record R checking for intrinsic uniqueness. +void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R, + StringMap<ClassKind> &EmittedMap) { std::string name = R->getValueAsString("Name"); std::string Proto = R->getValueAsString("Prototype"); std::string Types = R->getValueAsString("Types"); @@ -1876,12 +2765,20 @@ void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R) { (void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy); if (srcti == ti || inQuad != outQuad) continue; - OS << GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[srcti], - OpCast, ClassS); + std::string s = GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[srcti], + OpCast, ClassS); + if (EmittedMap.count(s)) + continue; + EmittedMap[s] = ClassS; + OS << s; } } else { - OS << GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[ti], - kind, classKind); + std::string s = + GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[ti], kind, classKind); + if (EmittedMap.count(s)) + continue; + EmittedMap[s] = classKind; + OS << s; } } OS << "\n"; @@ -1902,6 +2799,7 @@ static unsigned RangeFromType(const char mod, StringRef typestr) { 
case 'f': case 'i': return (2 << (int)quad) - 1; + case 'd': case 'l': return (1 << (int)quad) - 1; default: @@ -1909,56 +2807,198 @@ static unsigned RangeFromType(const char mod, StringRef typestr) { } } -/// runHeader - Emit a file with sections defining: -/// 1. the NEON section of BuiltinsARM.def. -/// 2. the SemaChecking code for the type overload checking. -/// 3. the SemaChecking code for validation of intrinsic immediate arguments. -void NeonEmitter::runHeader(raw_ostream &OS) { - std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst"); +static unsigned RangeScalarShiftImm(const char mod, StringRef typestr) { + // base type to get the type string for. + bool dummy = false; + char type = ClassifyType(typestr, dummy, dummy, dummy); + type = ModType(mod, type, dummy, dummy, dummy, dummy, dummy, dummy); + switch (type) { + case 'c': + return 7; + case 'h': + case 's': + return 15; + case 'f': + case 'i': + return 31; + case 'd': + case 'l': + return 63; + default: + PrintFatalError("unhandled type!"); + } +} + +/// Generate the ARM and AArch64 intrinsic range checking code for +/// shift/lane immediates, checking for unique declarations. +void +NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS, + StringMap<ClassKind> &A64IntrinsicMap, + bool isA64RangeCheck) { + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); StringMap<OpKind> EmittedMap; - // Generate BuiltinsARM.def for NEON - OS << "#ifdef GET_NEON_BUILTINS\n"; + // Generate the intrinsic range checking code for shift/lane immediates. 
+ if (isA64RangeCheck) + OS << "#ifdef GET_NEON_AARCH64_IMMEDIATE_CHECK\n"; + else + OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n"; + for (unsigned i = 0, e = RV.size(); i != e; ++i) { Record *R = RV[i]; + OpKind k = OpMap[R->getValueAsDef("Operand")->getName()]; if (k != OpNone) continue; + std::string name = R->getValueAsString("Name"); std::string Proto = R->getValueAsString("Prototype"); + std::string Types = R->getValueAsString("Types"); + std::string Rename = name + "@" + Proto; // Functions with 'a' (the splat code) in the type prototype should not get // their own builtin as they use the non-splat variant. if (Proto.find('a') != std::string::npos) continue; - std::string Types = R->getValueAsString("Types"); + // Functions which do not have an immediate do not need to have range + // checking code emitted. + size_t immPos = Proto.find('i'); + if (immPos == std::string::npos) + continue; + SmallVector<StringRef, 16> TypeVec; ParseTypes(R, Types, TypeVec); if (R->getSuperClasses().size() < 2) PrintFatalError(R->getLoc(), "Builtin has no class kind"); - std::string name = R->getValueAsString("Name"); ClassKind ck = ClassMap[R->getSuperClasses()[1]]; + if (!ProtoHasScalar(Proto)) + ck = ClassB; + + // Do not include AArch64 range checks if not generating code for AArch64. + bool isA64 = R->getValueAsBit("isA64"); + if (!isA64RangeCheck && isA64) + continue; + + // Include ARM range checks in AArch64 but only if ARM intrinsics are not + // redefined by AArch64 to handle new types. + if (isA64RangeCheck && !isA64 && A64IntrinsicMap.count(Rename)) { + ClassKind &A64CK = A64IntrinsicMap[Rename]; + if (A64CK == ck && ck != ClassNone) + continue; + } for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) { - // Generate the BuiltinsARM.def declaration for this builtin, ensuring - // that each unique BUILTIN() macro appears only once in the output - // stream. 
- std::string bd = GenBuiltinDef(name, Proto, TypeVec[ti], ck); - if (EmittedMap.count(bd)) + std::string namestr, shiftstr, rangestr; + + if (R->getValueAsBit("isVCVT_N")) { + // VCVT between floating- and fixed-point values takes an immediate + // in the range [1, 32] for f32, or [1, 64] for f64. + ck = ClassB; + if (name.find("32") != std::string::npos) + rangestr = "l = 1; u = 31"; // upper bound = l + u + else if (name.find("64") != std::string::npos) + rangestr = "l = 1; u = 63"; + else + PrintFatalError(R->getLoc(), + "Fixed point convert name should contains \"32\" or \"64\""); + + } else if (R->getValueAsBit("isScalarShift")) { + // Right shifts have an 'r' in the name, left shifts do not. Convert + // instructions have the same bounds and right shifts. + if (name.find('r') != std::string::npos || + name.find("cvt") != std::string::npos) + rangestr = "l = 1; "; + + unsigned upBound = RangeScalarShiftImm(Proto[immPos - 1], TypeVec[ti]); + // Narrow shift has half the upper bound + if (R->getValueAsBit("isScalarNarrowShift")) + upBound /= 2; + + rangestr += "u = " + utostr(upBound); + } else if (R->getValueAsBit("isShift")) { + // Builtins which are overloaded by type will need to have their upper + // bound computed at Sema time based on the type constant. + shiftstr = ", true"; + + // Right shifts have an 'r' in the name, left shifts do not. + if (name.find('r') != std::string::npos) + rangestr = "l = 1; "; + + rangestr += "u = RFT(TV" + shiftstr + ")"; + } else { + // The immediate generally refers to a lane in the preceding argument. + assert(immPos > 0 && "unexpected immediate operand"); + rangestr = + "u = " + utostr(RangeFromType(Proto[immPos - 1], TypeVec[ti])); + } + // Make sure cases appear only once by uniquing them in a string map. 
+ namestr = MangleName(name, TypeVec[ti], ck); + if (EmittedMap.count(namestr)) continue; + EmittedMap[namestr] = OpNone; - EmittedMap[bd] = OpNone; - OS << bd << "\n"; + // Calculate the index of the immediate that should be range checked. + unsigned immidx = 0; + + // Builtins that return a struct of multiple vectors have an extra + // leading arg for the struct return. + if (IsMultiVecProto(Proto[0])) + ++immidx; + + // Add one to the index for each argument until we reach the immediate + // to be checked. Structs of vectors are passed as multiple arguments. + for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) { + switch (Proto[ii]) { + default: + immidx += 1; + break; + case '2': + case 'B': + immidx += 2; + break; + case '3': + case 'C': + immidx += 3; + break; + case '4': + case 'D': + immidx += 4; + break; + case 'i': + ie = ii + 1; + break; + } + } + if (isA64RangeCheck) + OS << "case AArch64::BI__builtin_neon_"; + else + OS << "case ARM::BI__builtin_neon_"; + OS << MangleName(name, TypeVec[ti], ck) << ": i = " << immidx << "; " + << rangestr << "; break;\n"; } } OS << "#endif\n\n"; +} + +/// Generate the ARM and AArch64 overloaded type checking code for +/// SemaChecking.cpp, checking for unique builtin declarations. 
+void +NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS, + StringMap<ClassKind> &A64IntrinsicMap, + bool isA64TypeCheck) { + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); + StringMap<OpKind> EmittedMap; // Generate the overloaded type checking code for SemaChecking.cpp - OS << "#ifdef GET_NEON_OVERLOAD_CHECK\n"; + if (isA64TypeCheck) + OS << "#ifdef GET_NEON_AARCH64_OVERLOAD_CHECK\n"; + else + OS << "#ifdef GET_NEON_OVERLOAD_CHECK\n"; + for (unsigned i = 0, e = RV.size(); i != e; ++i) { Record *R = RV[i]; OpKind k = OpMap[R->getValueAsDef("Operand")->getName()]; @@ -1968,7 +3008,8 @@ void NeonEmitter::runHeader(raw_ostream &OS) { std::string Proto = R->getValueAsString("Prototype"); std::string Types = R->getValueAsString("Types"); std::string name = R->getValueAsString("Name"); - + std::string Rename = name + "@" + Proto; + // Functions with 'a' (the splat code) in the type prototype should not get // their own builtin as they use the non-splat variant. if (Proto.find('a') != std::string::npos) @@ -1976,7 +3017,7 @@ void NeonEmitter::runHeader(raw_ostream &OS) { // Functions which have a scalar argument cannot be overloaded, no need to // check them if we are emitting the type checking code. - if (Proto.find('s') != std::string::npos) + if (ProtoHasScalar(Proto)) continue; SmallVector<StringRef, 16> TypeVec; @@ -1985,6 +3026,21 @@ void NeonEmitter::runHeader(raw_ostream &OS) { if (R->getSuperClasses().size() < 2) PrintFatalError(R->getLoc(), "Builtin has no class kind"); + // Do not include AArch64 type checks if not generating code for AArch64. + bool isA64 = R->getValueAsBit("isA64"); + if (!isA64TypeCheck && isA64) + continue; + + // Include ARM type check in AArch64 but only if ARM intrinsics + // are not redefined in AArch64 to handle new types, e.g. "vabd" is a SIntr + // redefined in AArch64 to handle an additional 2 x f64 type. 
+ ClassKind ck = ClassMap[R->getSuperClasses()[1]]; + if (isA64TypeCheck && !isA64 && A64IntrinsicMap.count(Rename)) { + ClassKind &A64CK = A64IntrinsicMap[Rename]; + if (A64CK == ck && ck != ClassNone) + continue; + } + int si = -1, qi = -1; uint64_t mask = 0, qmask = 0; for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) { @@ -2017,7 +3073,7 @@ void NeonEmitter::runHeader(raw_ostream &OS) { } } // For sret builtins, adjust the pointer argument index. - if (PtrArgNum >= 0 && (Proto[0] >= '2' && Proto[0] <= '4')) + if (PtrArgNum >= 0 && IsMultiVecProto(Proto[0])) PtrArgNum += 1; // Omit type checking for the pointer arguments of vld1_lane, vld1_dup, @@ -2032,9 +3088,12 @@ void NeonEmitter::runHeader(raw_ostream &OS) { } if (mask) { - OS << "case ARM::BI__builtin_neon_" - << MangleName(name, TypeVec[si], ClassB) - << ": mask = " << "0x" << utohexstr(mask) << "ULL"; + if (isA64TypeCheck) + OS << "case AArch64::BI__builtin_neon_"; + else + OS << "case ARM::BI__builtin_neon_"; + OS << MangleName(name, TypeVec[si], ClassB) << ": mask = " + << "0x" << utohexstr(mask) << "ULL"; if (PtrArgNum >= 0) OS << "; PtrArgNum = " << PtrArgNum; if (HasConstPtr) @@ -2042,9 +3101,12 @@ void NeonEmitter::runHeader(raw_ostream &OS) { OS << "; break;\n"; } if (qmask) { - OS << "case ARM::BI__builtin_neon_" - << MangleName(name, TypeVec[qi], ClassB) - << ": mask = " << "0x" << utohexstr(qmask) << "ULL"; + if (isA64TypeCheck) + OS << "case AArch64::BI__builtin_neon_"; + else + OS << "case ARM::BI__builtin_neon_"; + OS << MangleName(name, TypeVec[qi], ClassB) << ": mask = " + << "0x" << utohexstr(qmask) << "ULL"; if (PtrArgNum >= 0) OS << "; PtrArgNum = " << PtrArgNum; if (HasConstPtr) @@ -2053,31 +3115,38 @@ void NeonEmitter::runHeader(raw_ostream &OS) { } } OS << "#endif\n\n"; +} + +/// genBuiltinsDef: Generate the BuiltinsARM.def and BuiltinsAArch64.def +/// declaration of builtins, checking for unique builtin declarations. 
+void NeonEmitter::genBuiltinsDef(raw_ostream &OS, + StringMap<ClassKind> &A64IntrinsicMap, + bool isA64GenBuiltinDef) { + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); + StringMap<OpKind> EmittedMap; + + // Generate BuiltinsARM.def and BuiltinsAArch64.def + if (isA64GenBuiltinDef) + OS << "#ifdef GET_NEON_AARCH64_BUILTINS\n"; + else + OS << "#ifdef GET_NEON_BUILTINS\n"; - // Generate the intrinsic range checking code for shift/lane immediates. - OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n"; for (unsigned i = 0, e = RV.size(); i != e; ++i) { Record *R = RV[i]; - OpKind k = OpMap[R->getValueAsDef("Operand")->getName()]; if (k != OpNone) continue; - std::string name = R->getValueAsString("Name"); std::string Proto = R->getValueAsString("Prototype"); - std::string Types = R->getValueAsString("Types"); + std::string name = R->getValueAsString("Name"); + std::string Rename = name + "@" + Proto; // Functions with 'a' (the splat code) in the type prototype should not get // their own builtin as they use the non-splat variant. if (Proto.find('a') != std::string::npos) continue; - // Functions which do not have an immediate do not need to have range - // checking code emitted. - size_t immPos = Proto.find('i'); - if (immPos == std::string::npos) - continue; - + std::string Types = R->getValueAsString("Types"); SmallVector<StringRef, 16> TypeVec; ParseTypes(R, Types, TypeVec); @@ -2086,70 +3155,92 @@ void NeonEmitter::runHeader(raw_ostream &OS) { ClassKind ck = ClassMap[R->getSuperClasses()[1]]; - for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) { - std::string namestr, shiftstr, rangestr; - - if (R->getValueAsBit("isVCVT_N")) { - // VCVT between floating- and fixed-point values takes an immediate - // in the range 1 to 32. 
- ck = ClassB; - rangestr = "l = 1; u = 31"; // upper bound = l + u - } else if (Proto.find('s') == std::string::npos) { - // Builtins which are overloaded by type will need to have their upper - // bound computed at Sema time based on the type constant. - ck = ClassB; - if (R->getValueAsBit("isShift")) { - shiftstr = ", true"; + // Do not include AArch64 BUILTIN() macros if not generating + // code for AArch64 + bool isA64 = R->getValueAsBit("isA64"); + if (!isA64GenBuiltinDef && isA64) + continue; - // Right shifts have an 'r' in the name, left shifts do not. - if (name.find('r') != std::string::npos) - rangestr = "l = 1; "; - } - rangestr += "u = RFT(TV" + shiftstr + ")"; - } else { - // The immediate generally refers to a lane in the preceding argument. - assert(immPos > 0 && "unexpected immediate operand"); - rangestr = "u = " + utostr(RangeFromType(Proto[immPos-1], TypeVec[ti])); - } - // Make sure cases appear only once by uniquing them in a string map. - namestr = MangleName(name, TypeVec[ti], ck); - if (EmittedMap.count(namestr)) + // Include ARM BUILTIN() macros in AArch64 but only if ARM intrinsics + // are not redefined in AArch64 to handle new types, e.g. "vabd" is a SIntr + // redefined in AArch64 to handle an additional 2 x f64 type. + if (isA64GenBuiltinDef && !isA64 && A64IntrinsicMap.count(Rename)) { + ClassKind &A64CK = A64IntrinsicMap[Rename]; + if (A64CK == ck && ck != ClassNone) continue; - EmittedMap[namestr] = OpNone; - - // Calculate the index of the immediate that should be range checked. - unsigned immidx = 0; + } - // Builtins that return a struct of multiple vectors have an extra - // leading arg for the struct return. - if (Proto[0] >= '2' && Proto[0] <= '4') - ++immidx; + for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) { + // Generate the declaration for this builtin, ensuring + // that each unique BUILTIN() macro appears only once in the output + // stream. 
+ std::string bd = GenBuiltinDef(name, Proto, TypeVec[ti], ck); + if (EmittedMap.count(bd)) + continue; - // Add one to the index for each argument until we reach the immediate - // to be checked. Structs of vectors are passed as multiple arguments. - for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) { - switch (Proto[ii]) { - default: immidx += 1; break; - case '2': immidx += 2; break; - case '3': immidx += 3; break; - case '4': immidx += 4; break; - case 'i': ie = ii + 1; break; - } - } - OS << "case ARM::BI__builtin_neon_" << MangleName(name, TypeVec[ti], ck) - << ": i = " << immidx << "; " << rangestr << "; break;\n"; + EmittedMap[bd] = OpNone; + OS << bd << "\n"; } } OS << "#endif\n\n"; } +/// runHeader - Emit a file with sections defining: +/// 1. the NEON section of BuiltinsARM.def and BuiltinsAArch64.def. +/// 2. the SemaChecking code for the type overload checking. +/// 3. the SemaChecking code for validation of intrinsic immediate arguments. +void NeonEmitter::runHeader(raw_ostream &OS) { + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); + + // build a map of AArch64 intriniscs to be used in uniqueness checks. 
+ StringMap<ClassKind> A64IntrinsicMap; + for (unsigned i = 0, e = RV.size(); i != e; ++i) { + Record *R = RV[i]; + + bool isA64 = R->getValueAsBit("isA64"); + if (!isA64) + continue; + + ClassKind CK = ClassNone; + if (R->getSuperClasses().size() >= 2) + CK = ClassMap[R->getSuperClasses()[1]]; + + std::string Name = R->getValueAsString("Name"); + std::string Proto = R->getValueAsString("Prototype"); + std::string Rename = Name + "@" + Proto; + if (A64IntrinsicMap.count(Rename)) + continue; + A64IntrinsicMap[Rename] = CK; + } + + // Generate BuiltinsARM.def for ARM + genBuiltinsDef(OS, A64IntrinsicMap, false); + + // Generate BuiltinsAArch64.def for AArch64 + genBuiltinsDef(OS, A64IntrinsicMap, true); + + // Generate ARM overloaded type checking code for SemaChecking.cpp + genOverloadTypeCheckCode(OS, A64IntrinsicMap, false); + + // Generate AArch64 overloaded type checking code for SemaChecking.cpp + genOverloadTypeCheckCode(OS, A64IntrinsicMap, true); + + // Generate ARM range checking code for shift/lane immediates. + genIntrinsicRangeCheckCode(OS, A64IntrinsicMap, false); + + // Generate the AArch64 range checking code for shift/lane immediates. + genIntrinsicRangeCheckCode(OS, A64IntrinsicMap, true); +} + /// GenTest - Write out a test for the intrinsic specified by the name and /// type strings, including the embedded patterns for FileCheck to match. 
static std::string GenTest(const std::string &name, const std::string &proto, StringRef outTypeStr, StringRef inTypeStr, bool isShift, bool isHiddenLOp, - ClassKind ck, const std::string &InstName) { + ClassKind ck, const std::string &InstName, + bool isA64, + std::string & testFuncProto) { assert(!proto.empty() && ""); std::string s; @@ -2164,12 +3255,17 @@ static std::string GenTest(const std::string &name, mangledName = MangleName(mangledName, inTypeNoQuad, ClassS); } + // todo: GenerateChecksForIntrinsic does not generate CHECK + // for aarch64 instructions yet std::vector<std::string> FileCheckPatterns; - GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName, - isHiddenLOp, FileCheckPatterns); + if (!isA64) { + GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName, + isHiddenLOp, FileCheckPatterns); + s+= "// CHECK_ARM: test_" + mangledName + "\n"; + } + s += "// CHECK_AARCH64: test_" + mangledName + "\n"; // Emit the FileCheck patterns. - s += "// CHECK: test_" + mangledName + "\n"; // If for any reason we do not want to emit a check, mangledInst // will be the empty string. if (FileCheckPatterns.size()) { @@ -2177,23 +3273,27 @@ static std::string GenTest(const std::string &name, e = FileCheckPatterns.end(); i != e; ++i) { - s += "// CHECK: " + *i + "\n"; + s += "// CHECK_ARM: " + *i + "\n"; } } // Emit the start of the test function. - s += TypeString(proto[0], outTypeStr) + " test_" + mangledName + "("; + + testFuncProto = TypeString(proto[0], outTypeStr) + " test_" + mangledName + "("; char arg = 'a'; std::string comma; for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) { // Do not create arguments for values that must be immediate constants. 
if (proto[i] == 'i') continue; - s += comma + TypeString(proto[i], inTypeStr) + " "; - s.push_back(arg); + testFuncProto += comma + TypeString(proto[i], inTypeStr) + " "; + testFuncProto.push_back(arg); comma = ", "; } - s += ") {\n "; + testFuncProto += ")"; + + s+= testFuncProto; + s+= " {\n "; if (proto[0] != 'v') s += "return "; @@ -2217,18 +3317,14 @@ static std::string GenTest(const std::string &name, return s; } -/// runTests - Write out a complete set of tests for all of the Neon -/// intrinsics. -void NeonEmitter::runTests(raw_ostream &OS) { - OS << - "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\\\n" - "// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\\\n" - "// RUN: | FileCheck %s\n" - "\n" - "#include <arm_neon.h>\n" - "\n"; +/// Write out all intrinsic tests for the specified target, checking +/// for intrinsic test uniqueness. +void NeonEmitter::genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap, + bool isA64GenTest) { + if (isA64GenTest) + OS << "#ifdef __aarch64__\n"; - std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst"); + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); for (unsigned i = 0, e = RV.size(); i != e; ++i) { Record *R = RV[i]; std::string name = R->getValueAsString("Name"); @@ -2237,6 +3333,12 @@ void NeonEmitter::runTests(raw_ostream &OS) { bool isShift = R->getValueAsBit("isShift"); std::string InstName = R->getValueAsString("InstName"); bool isHiddenLOp = R->getValueAsBit("isHiddenLInst"); + bool isA64 = R->getValueAsBit("isA64"); + + // do not include AArch64 intrinsic test if not generating + // code for AArch64 + if (!isA64GenTest && isA64) + continue; SmallVector<StringRef, 16> TypeVec; ParseTypes(R, Types, TypeVec); @@ -2256,16 +3358,56 @@ void NeonEmitter::runTests(raw_ostream &OS) { (void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy); if (srcti == ti || inQuad != outQuad) continue; - OS << GenTest(name, Proto, TypeVec[ti], TypeVec[srcti], - 
isShift, isHiddenLOp, ck, InstName); + std::string testFuncProto; + std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[srcti], + isShift, isHiddenLOp, ck, InstName, isA64, + testFuncProto); + if (EmittedMap.count(testFuncProto)) + continue; + EmittedMap[testFuncProto] = kind; + OS << s << "\n"; } } else { - OS << GenTest(name, Proto, TypeVec[ti], TypeVec[ti], - isShift, isHiddenLOp, ck, InstName); + std::string testFuncProto; + std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[ti], isShift, + isHiddenLOp, ck, InstName, isA64, testFuncProto); + if (EmittedMap.count(testFuncProto)) + continue; + EmittedMap[testFuncProto] = kind; + OS << s << "\n"; } } - OS << "\n"; } + + if (isA64GenTest) + OS << "#endif\n"; +} +/// runTests - Write out a complete set of tests for all of the Neon +/// intrinsics. +void NeonEmitter::runTests(raw_ostream &OS) { + OS << "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi " + "apcs-gnu\\\n" + "// RUN: -target-cpu swift -ffreestanding -Os -S -o - %s\\\n" + "// RUN: | FileCheck %s -check-prefix=CHECK_ARM\n" + "\n" + "// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \\\n" + "// RUN: -target-feature +neon -ffreestanding -S -o - %s \\\n" + "// RUN: | FileCheck %s -check-prefix=CHECK_AARCH64\n" + "\n" + "// REQUIRES: long_tests\n" + "\n" + "#include <arm_neon.h>\n" + "\n"; + + // ARM tests must be emitted before AArch64 tests to ensure + // tests for intrinsics that are common to ARM and AArch64 + // appear only once in the output stream. + // The check for uniqueness is done in genTargetTest. + StringMap<OpKind> EmittedMap; + + genTargetTest(OS, EmittedMap, false); + + genTargetTest(OS, EmittedMap, true); } namespace clang { |