Diffstat (limited to 'contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp | 6219 |
1 file changed, 3512 insertions, 2707 deletions
diff --git a/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp b/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
index 7726ad3..4f68b34 100644
--- a/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
@@ -212,11 +212,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
   default: break;  // Handle intrinsics and libm functions below.
   case Builtin::BI__builtin___CFStringMakeConstantString:
   case Builtin::BI__builtin___NSStringMakeConstantString:
-    return RValue::get(CGM.EmitConstantExpr(E, E->getType(), 0));
+    return RValue::get(CGM.EmitConstantExpr(E, E->getType(), nullptr));
   case Builtin::BI__builtin_stdarg_start:
   case Builtin::BI__builtin_va_start:
+  case Builtin::BI__va_start:
   case Builtin::BI__builtin_va_end: {
-    Value *ArgValue = EmitVAListRef(E->getArg(0));
+    Value *ArgValue = (BuiltinID == Builtin::BI__va_start)
+                          ? EmitScalarExpr(E->getArg(0))
+                          : EmitVAListRef(E->getArg(0));
     llvm::Type *DestType = Int8PtrTy;
     if (ArgValue->getType() != DestType)
       ArgValue = Builder.CreateBitCast(ArgValue, DestType,
@@ -429,6 +432,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Value *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
     return RValue::get(Builder.CreateCall(F));
   }
+  case Builtin::BI__builtin___clear_cache: {
+    Value *Begin = EmitScalarExpr(E->getArg(0));
+    Value *End = EmitScalarExpr(E->getArg(1));
+    Value *F = CGM.getIntrinsic(Intrinsic::clear_cache);
+    return RValue::get(Builder.CreateCall2(F, Begin, End));
+  }
   case Builtin::BI__builtin_trap: {
     Value *F = CGM.getIntrinsic(Intrinsic::trap);
     return RValue::get(Builder.CreateCall(F));
@@ -438,17 +447,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     return RValue::get(Builder.CreateCall(F));
   }
   case Builtin::BI__builtin_unreachable: {
-    if (SanOpts->Unreachable)
+    if (SanOpts->Unreachable) {
+      SanitizerScope SanScope(this);
       EmitCheck(Builder.getFalse(), "builtin_unreachable",
                 EmitCheckSourceLocation(E->getExprLoc()),
                 ArrayRef<llvm::Value *>(), CRK_Unrecoverable);
-    else
+    } else
       Builder.CreateUnreachable();
 
     // We do need to preserve an insertion point.
     EmitBlock(createBasicBlock("unreachable.cont"));
 
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
 
   case Builtin::BI__builtin_powi:
@@ -781,7 +791,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     // We do need to preserve an insertion point.
     EmitBlock(createBasicBlock("builtin_eh_return.cont"));
 
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
   case Builtin::BI__builtin_unwind_init: {
     Value *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
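For readers unfamiliar with the new BI__builtin___clear_cache case above: the builtin takes a begin/end pointer pair delimiting freshly written code and is lowered directly to the llvm.clear_cache intrinsic. A minimal caller-side sketch of how the builtin is typically used (illustrative only; the function and variable names are invented and are not part of this patch):

    // Hypothetical helper: flush the instruction cache for a JIT buffer.
    // __builtin___clear_cache(begin, end) becomes a call to llvm.clear_cache,
    // which targets such as ARM/AArch64 expand to their cache-maintenance code.
    void finalize_jit_buffer(char *buf, unsigned len) {
      // ... machine code has just been written into [buf, buf + len) ...
      __builtin___clear_cache(buf, buf + len);
    }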
@@ -847,7 +857,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     // We do need to preserve an insertion point.
     EmitBlock(createBasicBlock("longjmp.cont"));
 
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
   case Builtin::BI__sync_fetch_and_add:
   case Builtin::BI__sync_fetch_and_sub:
@@ -964,7 +974,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
 
     Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
+                                                llvm::SequentiallyConsistent,
                                                 llvm::SequentiallyConsistent);
+    Result = Builder.CreateExtractValue(Result, 0);
     Result = EmitFromInt(*this, Result, T, ValueType);
     return RValue::get(Result);
   }
@@ -988,10 +1000,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Args[1] = EmitToInt(*this, EmitScalarExpr(E->getArg(1)), T, IntType);
     Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
 
-    Value *OldVal = Args[1];
-    Value *PrevVal = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
-                                                 llvm::SequentiallyConsistent);
-    Value *Result = Builder.CreateICmpEQ(PrevVal, OldVal);
+    Value *Pair = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
+                                              llvm::SequentiallyConsistent,
+                                              llvm::SequentiallyConsistent);
+    Value *Result = Builder.CreateExtractValue(Pair, 1);
     // zext bool to int.
     Result = Builder.CreateZExt(Result, ConvertType(E->getType()));
     return RValue::get(Result);
@@ -1026,7 +1038,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Builder.CreateStore(llvm::Constant::getNullValue(ITy), Ptr);
     Store->setAlignment(StoreSize.getQuantity());
     Store->setAtomic(llvm::Release);
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
 
   case Builtin::BI__sync_synchronize: {
@@ -1038,7 +1050,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     // to use it with non-atomic loads and stores to get acquire/release
     // semantics.
     Builder.CreateFence(llvm::SequentiallyConsistent);
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
 
   case Builtin::BI__c11_atomic_is_lock_free:
@@ -1079,7 +1091,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     Value *Order = EmitScalarExpr(E->getArg(1));
     if (isa<llvm::ConstantInt>(Order)) {
       int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
-      AtomicRMWInst *Result = 0;
+      AtomicRMWInst *Result = nullptr;
       switch (ord) {
       case 0:  // memory_order_relaxed
       default: // invalid order
@@ -1179,7 +1191,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
       Store->setOrdering(llvm::SequentiallyConsistent);
       break;
     }
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
 
   llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
@@ -1209,7 +1221,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
   SI->addCase(Builder.getInt32(5), BBs[2]);
 
   Builder.SetInsertPoint(ContBB);
-  return RValue::get(0);
+  return RValue::get(nullptr);
   }
 
   case Builtin::BI__atomic_thread_fence:
@@ -1243,7 +1255,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
       Builder.CreateFence(llvm::SequentiallyConsistent, Scope);
       break;
     }
-    return RValue::get(0);
+    return RValue::get(nullptr);
   }
 
   llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
@@ -1278,7 +1290,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
   SI->addCase(Builder.getInt32(5), SeqCstBB);
 
   Builder.SetInsertPoint(ContBB);
-  return RValue::get(0);
+  return RValue::get(nullptr);
   }
 
   // Library functions with special handling.
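The two __sync compare-and-swap hunks above track LLVM's change of the cmpxchg instruction to take separate success/failure orderings and to return a { previous value, success flag } pair, which is why element 0 is extracted for __sync_val_compare_and_swap and element 1 for __sync_bool_compare_and_swap. A rough sketch of the two builtins' semantics in portable C++ (illustrative only; the helper names are invented and std::atomic stands in for the emitted IR):

    #include <atomic>

    // "Element 0": return the value the object held before the exchange attempt.
    int sync_val_cas(std::atomic<int> *p, int expected, int desired) {
      p->compare_exchange_strong(expected, desired);  // updates 'expected' to the old value
      return expected;
    }

    // "Element 1": return whether the exchange succeeded.
    bool sync_bool_cas(std::atomic<int> *p, int expected, int desired) {
      return p->compare_exchange_strong(expected, desired);
    }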
@@ -1311,7 +1323,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
     llvm::Type *ArgType = Base->getType();
     Value *F = CGM.getIntrinsic(Intrinsic::pow, ArgType);
     return RValue::get(Builder.CreateCall2(F, Base, Exponent));
-    break;
   }
 
   case Builtin::BIfma:
@@ -1498,8 +1509,84 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
   }
   case Builtin::BI__builtin_addressof:
     return RValue::get(EmitLValue(E->getArg(0)).getAddress());
+  case Builtin::BI__builtin_operator_new:
+    return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(),
+                                    E->getArg(0), false);
+  case Builtin::BI__builtin_operator_delete:
+    return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(),
+                                    E->getArg(0), true);
   case Builtin::BI__noop:
-    return RValue::get(0);
+    // __noop always evaluates to an integer literal zero.
+    return RValue::get(ConstantInt::get(IntTy, 0));
+  case Builtin::BI__assume:
+    // Until LLVM supports assumptions at the IR level, this becomes nothing.
+    return RValue::get(nullptr);
+  case Builtin::BI_InterlockedExchange:
+  case Builtin::BI_InterlockedExchangePointer:
+    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
+  case Builtin::BI_InterlockedCompareExchangePointer: {
+    llvm::Type *RTy;
+    llvm::IntegerType *IntType =
+      IntegerType::get(getLLVMContext(),
+                       getContext().getTypeSize(E->getType()));
+    llvm::Type *IntPtrType = IntType->getPointerTo();
+
+    llvm::Value *Destination =
+      Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), IntPtrType);
+
+    llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
+    RTy = Exchange->getType();
+    Exchange = Builder.CreatePtrToInt(Exchange, IntType);
+
+    llvm::Value *Comparand =
+      Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
+
+    auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange,
+                                              SequentiallyConsistent,
+                                              SequentiallyConsistent);
+    Result->setVolatile(true);
+
+    return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
+                                                                         0),
+                                              RTy));
+  }
+  case Builtin::BI_InterlockedCompareExchange: {
+    AtomicCmpXchgInst *CXI = Builder.CreateAtomicCmpXchg(
+        EmitScalarExpr(E->getArg(0)),
+        EmitScalarExpr(E->getArg(2)),
+        EmitScalarExpr(E->getArg(1)),
+        SequentiallyConsistent,
+        SequentiallyConsistent);
+    CXI->setVolatile(true);
+    return RValue::get(Builder.CreateExtractValue(CXI, 0));
+  }
+  case Builtin::BI_InterlockedIncrement: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+        AtomicRMWInst::Add,
+        EmitScalarExpr(E->getArg(0)),
+        ConstantInt::get(Int32Ty, 1),
+        llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(Int32Ty, 1)));
+  }
+  case Builtin::BI_InterlockedDecrement: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+        AtomicRMWInst::Sub,
+        EmitScalarExpr(E->getArg(0)),
+        ConstantInt::get(Int32Ty, 1),
+        llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(Int32Ty, 1)));
+  }
+  case Builtin::BI_InterlockedExchangeAdd: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+        AtomicRMWInst::Add,
+        EmitScalarExpr(E->getArg(0)),
+        EmitScalarExpr(E->getArg(1)),
+        llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(RMWI);
+  }
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
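A note on the _InterlockedIncrement/_InterlockedDecrement cases just added: atomicrmw add/sub yields the value the memory held before the operation, while the Microsoft intrinsics are documented to return the value after it, hence the extra CreateAdd/CreateSub on the result. A rough portable-C++ sketch of that semantic difference (illustrative only; the helper name is invented and std::atomic stands in for the emitted IR):

    #include <atomic>

    // Mirrors the emitted pattern: fetch_add returns the old value (like atomicrmw),
    // so add 1 to produce the post-increment value _InterlockedIncrement reports.
    long interlocked_increment(std::atomic<long> *p) {
      long old = p->fetch_add(1, std::memory_order_seq_cst);
      return old + 1;
    }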
@@ -1518,8 +1605,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
   const char *Name = getContext().BuiltinInfo.GetName(BuiltinID);
   Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
   if (const char *Prefix =
-      llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch()))
+          llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch())) {
     IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix, Name);
+    // NOTE we dont need to perform a compatibility flag check here since the
+    // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
+    // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
+    if (IntrinsicID == Intrinsic::not_intrinsic)
+      IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix, Name);
+  }
 
   if (IntrinsicID != Intrinsic::not_intrinsic) {
     SmallVector<Value*, 16> Args;
@@ -1590,11 +1683,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   switch (getTarget().getTriple().getArch()) {
-  case llvm::Triple::aarch64:
-    return EmitAArch64BuiltinExpr(BuiltinID, E);
   case llvm::Triple::arm:
+  case llvm::Triple::armeb:
   case llvm::Triple::thumb:
+  case llvm::Triple::thumbeb:
     return EmitARMBuiltinExpr(BuiltinID, E);
+  case llvm::Triple::aarch64:
+  case llvm::Triple::aarch64_be:
+  case llvm::Triple::arm64:
+  case llvm::Triple::arm64_be:
+    return EmitAArch64BuiltinExpr(BuiltinID, E);
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
     return EmitX86BuiltinExpr(BuiltinID, E);
@@ -1602,8 +1700,10 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
     return EmitPPCBuiltinExpr(BuiltinID, E);
+  case llvm::Triple::r600:
+    return EmitR600BuiltinExpr(BuiltinID, E);
   default:
-    return 0;
+    return nullptr;
   }
 }
 
@@ -1624,6 +1724,11 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
   case NeonTypeFlags::Int64:
   case NeonTypeFlags::Poly64:
     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
+  case NeonTypeFlags::Poly128:
+    // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
+    // There is a lot of i128 and f128 API missing.
+    // so we use v16i8 to represent poly128 and get pattern matched.
+    return llvm::VectorType::get(CGF->Int8Ty, 16);
   case NeonTypeFlags::Float32:
     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Float64:
@@ -1749,2182 +1854,1232 @@ CodeGenFunction::EmitPointerWithAlignment(const Expr *Addr) {
   return std::make_pair(EmitScalarExpr(Addr), Align);
 }
 
-static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
-                                           unsigned BuiltinID,
-                                           const CallExpr *E) {
-  unsigned int Int = 0;
-  // Scalar result generated across vectors
-  bool AcrossVec = false;
-  // Extend element of one-element vector
-  bool ExtendEle = false;
-  bool OverloadInt = false;
-  bool OverloadCmpInt = false;
-  bool IsFpCmpZInt = false;
-  bool OverloadCvtInt = false;
-  bool OverloadWideInt = false;
-  bool OverloadNarrowInt = false;
-  const char *s = NULL;
-
-  SmallVector<Value *, 4> Ops;
-  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
-  }
-
-  // AArch64 scalar builtins are not overloaded, they do not have an extra
-  // argument that specifies the vector type, need to handle each case.
- switch (BuiltinID) { - default: break; - case AArch64::BI__builtin_neon_vdups_lane_f32: - case AArch64::BI__builtin_neon_vdupd_lane_f64: - case AArch64::BI__builtin_neon_vdups_laneq_f32: - case AArch64::BI__builtin_neon_vdupd_laneq_f64: { - return CGF.Builder.CreateExtractElement(Ops[0], Ops[1], "vdup_lane"); - } - case AArch64::BI__builtin_neon_vdupb_lane_i8: - case AArch64::BI__builtin_neon_vduph_lane_i16: - case AArch64::BI__builtin_neon_vdups_lane_i32: - case AArch64::BI__builtin_neon_vdupd_lane_i64: - case AArch64::BI__builtin_neon_vdupb_laneq_i8: - case AArch64::BI__builtin_neon_vduph_laneq_i16: - case AArch64::BI__builtin_neon_vdups_laneq_i32: - case AArch64::BI__builtin_neon_vdupd_laneq_i64: { - // The backend treats Neon scalar types as v1ix types - // So we want to dup lane from any vector to v1ix vector - // with shufflevector - s = "vdup_lane"; - Value* SV = llvm::ConstantVector::getSplat(1, cast<ConstantInt>(Ops[1])); - Value *Result = CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], SV, s); - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - // AArch64 intrinsic one-element vector type cast to - // scalar type expected by the builtin - return CGF.Builder.CreateBitCast(Result, Ty, s); - } - case AArch64::BI__builtin_neon_vqdmlalh_lane_s16 : - case AArch64::BI__builtin_neon_vqdmlalh_laneq_s16 : - case AArch64::BI__builtin_neon_vqdmlals_lane_s32 : - case AArch64::BI__builtin_neon_vqdmlals_laneq_s32 : - case AArch64::BI__builtin_neon_vqdmlslh_lane_s16 : - case AArch64::BI__builtin_neon_vqdmlslh_laneq_s16 : - case AArch64::BI__builtin_neon_vqdmlsls_lane_s32 : - case AArch64::BI__builtin_neon_vqdmlsls_laneq_s32 : { - Int = Intrinsic::arm_neon_vqadds; - if (BuiltinID == AArch64::BI__builtin_neon_vqdmlslh_lane_s16 || - BuiltinID == AArch64::BI__builtin_neon_vqdmlslh_laneq_s16 || - BuiltinID == AArch64::BI__builtin_neon_vqdmlsls_lane_s32 || - BuiltinID == AArch64::BI__builtin_neon_vqdmlsls_laneq_s32) { - Int = Intrinsic::arm_neon_vqsubs; - } - // create vqdmull call with b * c[i] - llvm::Type *Ty = CGF.ConvertType(E->getArg(1)->getType()); - llvm::VectorType *OpVTy = llvm::VectorType::get(Ty, 1); - Ty = CGF.ConvertType(E->getArg(0)->getType()); - llvm::VectorType *ResVTy = llvm::VectorType::get(Ty, 1); - Value *F = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, ResVTy); - Value *V = UndefValue::get(OpVTy); - llvm::Constant *CI = ConstantInt::get(CGF.Int32Ty, 0); - SmallVector<Value *, 2> MulOps; - MulOps.push_back(Ops[1]); - MulOps.push_back(Ops[2]); - MulOps[0] = CGF.Builder.CreateInsertElement(V, MulOps[0], CI); - MulOps[1] = CGF.Builder.CreateExtractElement(MulOps[1], Ops[3], "extract"); - MulOps[1] = CGF.Builder.CreateInsertElement(V, MulOps[1], CI); - Value *MulRes = CGF.Builder.CreateCall2(F, MulOps[0], MulOps[1]); - // create vqadds call with a +/- vqdmull result - F = CGF.CGM.getIntrinsic(Int, ResVTy); - SmallVector<Value *, 2> AddOps; - AddOps.push_back(Ops[0]); - AddOps.push_back(MulRes); - V = UndefValue::get(ResVTy); - AddOps[0] = CGF.Builder.CreateInsertElement(V, AddOps[0], CI); - Value *AddRes = CGF.Builder.CreateCall2(F, AddOps[0], AddOps[1]); - return CGF.Builder.CreateBitCast(AddRes, Ty); - } - case AArch64::BI__builtin_neon_vfmas_lane_f32: - case AArch64::BI__builtin_neon_vfmas_laneq_f32: - case AArch64::BI__builtin_neon_vfmad_lane_f64: - case AArch64::BI__builtin_neon_vfmad_laneq_f64: { - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - Value *F = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], 
Ops[3], "extract"); - return CGF.Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); - } - // Scalar Floating-point Multiply Extended - case AArch64::BI__builtin_neon_vmulxs_f32: - case AArch64::BI__builtin_neon_vmulxd_f64: { - Int = Intrinsic::aarch64_neon_vmulx; - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); - } - case AArch64::BI__builtin_neon_vmul_n_f64: { - // v1f64 vmul_n_f64 should be mapped to Neon scalar mul lane - llvm::Type *VTy = GetNeonType(&CGF, - NeonTypeFlags(NeonTypeFlags::Float64, false, false)); - Ops[0] = CGF.Builder.CreateBitCast(Ops[0], VTy); - llvm::Value *Idx = llvm::ConstantInt::get(CGF.Int32Ty, 0); - Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], Idx, "extract"); - Value *Result = CGF.Builder.CreateFMul(Ops[0], Ops[1]); - return CGF.Builder.CreateBitCast(Result, VTy); - } - case AArch64::BI__builtin_neon_vget_lane_i8: - case AArch64::BI__builtin_neon_vget_lane_i16: - case AArch64::BI__builtin_neon_vget_lane_i32: - case AArch64::BI__builtin_neon_vget_lane_i64: - case AArch64::BI__builtin_neon_vget_lane_f32: - case AArch64::BI__builtin_neon_vget_lane_f64: - case AArch64::BI__builtin_neon_vgetq_lane_i8: - case AArch64::BI__builtin_neon_vgetq_lane_i16: - case AArch64::BI__builtin_neon_vgetq_lane_i32: - case AArch64::BI__builtin_neon_vgetq_lane_i64: - case AArch64::BI__builtin_neon_vgetq_lane_f32: - case AArch64::BI__builtin_neon_vgetq_lane_f64: - return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vget_lane_i8, E); - case AArch64::BI__builtin_neon_vset_lane_i8: - case AArch64::BI__builtin_neon_vset_lane_i16: - case AArch64::BI__builtin_neon_vset_lane_i32: - case AArch64::BI__builtin_neon_vset_lane_i64: - case AArch64::BI__builtin_neon_vset_lane_f32: - case AArch64::BI__builtin_neon_vset_lane_f64: - case AArch64::BI__builtin_neon_vsetq_lane_i8: - case AArch64::BI__builtin_neon_vsetq_lane_i16: - case AArch64::BI__builtin_neon_vsetq_lane_i32: - case AArch64::BI__builtin_neon_vsetq_lane_i64: - case AArch64::BI__builtin_neon_vsetq_lane_f32: - case AArch64::BI__builtin_neon_vsetq_lane_f64: - return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vset_lane_i8, E); - // Crypto - case AArch64::BI__builtin_neon_vsha1h_u32: - Int = Intrinsic::arm_neon_sha1h; - s = "sha1h"; OverloadInt = true; break; - case AArch64::BI__builtin_neon_vsha1cq_u32: - Int = Intrinsic::aarch64_neon_sha1c; - s = "sha1c"; break; - case AArch64::BI__builtin_neon_vsha1pq_u32: - Int = Intrinsic::aarch64_neon_sha1p; - s = "sha1p"; break; - case AArch64::BI__builtin_neon_vsha1mq_u32: - Int = Intrinsic::aarch64_neon_sha1m; - s = "sha1m"; break; - // Scalar Add - case AArch64::BI__builtin_neon_vaddd_s64: - Int = Intrinsic::aarch64_neon_vaddds; - s = "vaddds"; break; - case AArch64::BI__builtin_neon_vaddd_u64: - Int = Intrinsic::aarch64_neon_vadddu; - s = "vadddu"; break; - // Scalar Sub - case AArch64::BI__builtin_neon_vsubd_s64: - Int = Intrinsic::aarch64_neon_vsubds; - s = "vsubds"; break; - case AArch64::BI__builtin_neon_vsubd_u64: - Int = Intrinsic::aarch64_neon_vsubdu; - s = "vsubdu"; break; - // Scalar Saturating Add - case AArch64::BI__builtin_neon_vqaddb_s8: - case AArch64::BI__builtin_neon_vqaddh_s16: - case AArch64::BI__builtin_neon_vqadds_s32: - case AArch64::BI__builtin_neon_vqaddd_s64: - Int = Intrinsic::arm_neon_vqadds; - s = "vqadds"; OverloadInt = true; break; - case AArch64::BI__builtin_neon_vqaddb_u8: - case AArch64::BI__builtin_neon_vqaddh_u16: - case AArch64::BI__builtin_neon_vqadds_u32: - case 
AArch64::BI__builtin_neon_vqaddd_u64: - Int = Intrinsic::arm_neon_vqaddu; - s = "vqaddu"; OverloadInt = true; break; - // Scalar Saturating Sub - case AArch64::BI__builtin_neon_vqsubb_s8: - case AArch64::BI__builtin_neon_vqsubh_s16: - case AArch64::BI__builtin_neon_vqsubs_s32: - case AArch64::BI__builtin_neon_vqsubd_s64: - Int = Intrinsic::arm_neon_vqsubs; - s = "vqsubs"; OverloadInt = true; break; - case AArch64::BI__builtin_neon_vqsubb_u8: - case AArch64::BI__builtin_neon_vqsubh_u16: - case AArch64::BI__builtin_neon_vqsubs_u32: - case AArch64::BI__builtin_neon_vqsubd_u64: - Int = Intrinsic::arm_neon_vqsubu; - s = "vqsubu"; OverloadInt = true; break; - // Scalar Shift Left - case AArch64::BI__builtin_neon_vshld_s64: - Int = Intrinsic::aarch64_neon_vshlds; - s = "vshlds"; break; - case AArch64::BI__builtin_neon_vshld_u64: - Int = Intrinsic::aarch64_neon_vshldu; - s = "vshldu"; break; - // Scalar Saturating Shift Left - case AArch64::BI__builtin_neon_vqshlb_s8: - case AArch64::BI__builtin_neon_vqshlh_s16: - case AArch64::BI__builtin_neon_vqshls_s32: - case AArch64::BI__builtin_neon_vqshld_s64: - Int = Intrinsic::aarch64_neon_vqshls; - s = "vqshls"; OverloadInt = true; break; - case AArch64::BI__builtin_neon_vqshlb_u8: - case AArch64::BI__builtin_neon_vqshlh_u16: - case AArch64::BI__builtin_neon_vqshls_u32: - case AArch64::BI__builtin_neon_vqshld_u64: - Int = Intrinsic::aarch64_neon_vqshlu; - s = "vqshlu"; OverloadInt = true; break; - // Scalar Rouding Shift Left - case AArch64::BI__builtin_neon_vrshld_s64: - Int = Intrinsic::aarch64_neon_vrshlds; - s = "vrshlds"; break; - case AArch64::BI__builtin_neon_vrshld_u64: - Int = Intrinsic::aarch64_neon_vrshldu; - s = "vrshldu"; break; - // Scalar Saturating Rouding Shift Left - case AArch64::BI__builtin_neon_vqrshlb_s8: - case AArch64::BI__builtin_neon_vqrshlh_s16: - case AArch64::BI__builtin_neon_vqrshls_s32: - case AArch64::BI__builtin_neon_vqrshld_s64: - Int = Intrinsic::aarch64_neon_vqrshls; - s = "vqrshls"; OverloadInt = true; break; - case AArch64::BI__builtin_neon_vqrshlb_u8: - case AArch64::BI__builtin_neon_vqrshlh_u16: - case AArch64::BI__builtin_neon_vqrshls_u32: - case AArch64::BI__builtin_neon_vqrshld_u64: - Int = Intrinsic::aarch64_neon_vqrshlu; - s = "vqrshlu"; OverloadInt = true; break; - // Scalar Reduce Pairwise Add - case AArch64::BI__builtin_neon_vpaddd_s64: - case AArch64::BI__builtin_neon_vpaddd_u64: - Int = Intrinsic::aarch64_neon_vpadd; s = "vpadd"; - break; - case AArch64::BI__builtin_neon_vpadds_f32: - Int = Intrinsic::aarch64_neon_vpfadd; s = "vpfadd"; - break; - case AArch64::BI__builtin_neon_vpaddd_f64: - Int = Intrinsic::aarch64_neon_vpfaddq; s = "vpfaddq"; - break; - // Scalar Reduce Pairwise Floating Point Max - case AArch64::BI__builtin_neon_vpmaxs_f32: - Int = Intrinsic::aarch64_neon_vpmax; s = "vpmax"; - break; - case AArch64::BI__builtin_neon_vpmaxqd_f64: - Int = Intrinsic::aarch64_neon_vpmaxq; s = "vpmaxq"; - break; - // Scalar Reduce Pairwise Floating Point Min - case AArch64::BI__builtin_neon_vpmins_f32: - Int = Intrinsic::aarch64_neon_vpmin; s = "vpmin"; - break; - case AArch64::BI__builtin_neon_vpminqd_f64: - Int = Intrinsic::aarch64_neon_vpminq; s = "vpminq"; - break; - // Scalar Reduce Pairwise Floating Point Maxnm - case AArch64::BI__builtin_neon_vpmaxnms_f32: - Int = Intrinsic::aarch64_neon_vpfmaxnm; s = "vpfmaxnm"; - break; - case AArch64::BI__builtin_neon_vpmaxnmqd_f64: - Int = Intrinsic::aarch64_neon_vpfmaxnmq; s = "vpfmaxnmq"; - break; - // Scalar Reduce Pairwise Floating Point Minnm - case 
AArch64::BI__builtin_neon_vpminnms_f32: - Int = Intrinsic::aarch64_neon_vpfminnm; s = "vpfminnm"; - break; - case AArch64::BI__builtin_neon_vpminnmqd_f64: - Int = Intrinsic::aarch64_neon_vpfminnmq; s = "vpfminnmq"; - break; - // The followings are intrinsics with scalar results generated AcrossVec vectors - case AArch64::BI__builtin_neon_vaddlv_s8: - case AArch64::BI__builtin_neon_vaddlv_s16: - case AArch64::BI__builtin_neon_vaddlvq_s8: - case AArch64::BI__builtin_neon_vaddlvq_s16: - case AArch64::BI__builtin_neon_vaddlvq_s32: - Int = Intrinsic::aarch64_neon_saddlv; - AcrossVec = true; ExtendEle = true; s = "saddlv"; break; - case AArch64::BI__builtin_neon_vaddlv_u8: - case AArch64::BI__builtin_neon_vaddlv_u16: - case AArch64::BI__builtin_neon_vaddlvq_u8: - case AArch64::BI__builtin_neon_vaddlvq_u16: - case AArch64::BI__builtin_neon_vaddlvq_u32: - Int = Intrinsic::aarch64_neon_uaddlv; - AcrossVec = true; ExtendEle = true; s = "uaddlv"; break; - case AArch64::BI__builtin_neon_vmaxv_s8: - case AArch64::BI__builtin_neon_vmaxv_s16: - case AArch64::BI__builtin_neon_vmaxvq_s8: - case AArch64::BI__builtin_neon_vmaxvq_s16: - case AArch64::BI__builtin_neon_vmaxvq_s32: - Int = Intrinsic::aarch64_neon_smaxv; - AcrossVec = true; ExtendEle = false; s = "smaxv"; break; - case AArch64::BI__builtin_neon_vmaxv_u8: - case AArch64::BI__builtin_neon_vmaxv_u16: - case AArch64::BI__builtin_neon_vmaxvq_u8: - case AArch64::BI__builtin_neon_vmaxvq_u16: - case AArch64::BI__builtin_neon_vmaxvq_u32: - Int = Intrinsic::aarch64_neon_umaxv; - AcrossVec = true; ExtendEle = false; s = "umaxv"; break; - case AArch64::BI__builtin_neon_vminv_s8: - case AArch64::BI__builtin_neon_vminv_s16: - case AArch64::BI__builtin_neon_vminvq_s8: - case AArch64::BI__builtin_neon_vminvq_s16: - case AArch64::BI__builtin_neon_vminvq_s32: - Int = Intrinsic::aarch64_neon_sminv; - AcrossVec = true; ExtendEle = false; s = "sminv"; break; - case AArch64::BI__builtin_neon_vminv_u8: - case AArch64::BI__builtin_neon_vminv_u16: - case AArch64::BI__builtin_neon_vminvq_u8: - case AArch64::BI__builtin_neon_vminvq_u16: - case AArch64::BI__builtin_neon_vminvq_u32: - Int = Intrinsic::aarch64_neon_uminv; - AcrossVec = true; ExtendEle = false; s = "uminv"; break; - case AArch64::BI__builtin_neon_vaddv_s8: - case AArch64::BI__builtin_neon_vaddv_s16: - case AArch64::BI__builtin_neon_vaddvq_s8: - case AArch64::BI__builtin_neon_vaddvq_s16: - case AArch64::BI__builtin_neon_vaddvq_s32: - case AArch64::BI__builtin_neon_vaddvq_s64: - case AArch64::BI__builtin_neon_vaddv_u8: - case AArch64::BI__builtin_neon_vaddv_u16: - case AArch64::BI__builtin_neon_vaddvq_u8: - case AArch64::BI__builtin_neon_vaddvq_u16: - case AArch64::BI__builtin_neon_vaddvq_u32: - case AArch64::BI__builtin_neon_vaddvq_u64: - case AArch64::BI__builtin_neon_vaddv_f32: - case AArch64::BI__builtin_neon_vaddvq_f32: - case AArch64::BI__builtin_neon_vaddvq_f64: - Int = Intrinsic::aarch64_neon_vaddv; - AcrossVec = true; ExtendEle = false; s = "vaddv"; break; - case AArch64::BI__builtin_neon_vmaxv_f32: - case AArch64::BI__builtin_neon_vmaxvq_f32: - case AArch64::BI__builtin_neon_vmaxvq_f64: - Int = Intrinsic::aarch64_neon_vmaxv; - AcrossVec = true; ExtendEle = false; s = "vmaxv"; break; - case AArch64::BI__builtin_neon_vminv_f32: - case AArch64::BI__builtin_neon_vminvq_f32: - case AArch64::BI__builtin_neon_vminvq_f64: - Int = Intrinsic::aarch64_neon_vminv; - AcrossVec = true; ExtendEle = false; s = "vminv"; break; - case AArch64::BI__builtin_neon_vmaxnmv_f32: - case AArch64::BI__builtin_neon_vmaxnmvq_f32: 
- case AArch64::BI__builtin_neon_vmaxnmvq_f64: - Int = Intrinsic::aarch64_neon_vmaxnmv; - AcrossVec = true; ExtendEle = false; s = "vmaxnmv"; break; - case AArch64::BI__builtin_neon_vminnmv_f32: - case AArch64::BI__builtin_neon_vminnmvq_f32: - case AArch64::BI__builtin_neon_vminnmvq_f64: - Int = Intrinsic::aarch64_neon_vminnmv; - AcrossVec = true; ExtendEle = false; s = "vminnmv"; break; - // Scalar Integer Saturating Doubling Multiply Half High - case AArch64::BI__builtin_neon_vqdmulhh_s16: - case AArch64::BI__builtin_neon_vqdmulhs_s32: - Int = Intrinsic::arm_neon_vqdmulh; - s = "vqdmulh"; OverloadInt = true; break; - // Scalar Integer Saturating Rounding Doubling Multiply Half High - case AArch64::BI__builtin_neon_vqrdmulhh_s16: - case AArch64::BI__builtin_neon_vqrdmulhs_s32: - Int = Intrinsic::arm_neon_vqrdmulh; - s = "vqrdmulh"; OverloadInt = true; break; - // Scalar Floating-point Reciprocal Step and - case AArch64::BI__builtin_neon_vrecpss_f32: - case AArch64::BI__builtin_neon_vrecpsd_f64: - Int = Intrinsic::arm_neon_vrecps; - s = "vrecps"; OverloadInt = true; break; - // Scalar Floating-point Reciprocal Square Root Step - case AArch64::BI__builtin_neon_vrsqrtss_f32: - case AArch64::BI__builtin_neon_vrsqrtsd_f64: - Int = Intrinsic::arm_neon_vrsqrts; - s = "vrsqrts"; OverloadInt = true; break; - // Scalar Signed Integer Convert To Floating-point - case AArch64::BI__builtin_neon_vcvts_f32_s32: - Int = Intrinsic::aarch64_neon_vcvtf32_s32, - s = "vcvtf"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_f64_s64: - Int = Intrinsic::aarch64_neon_vcvtf64_s64, - s = "vcvtf"; OverloadInt = false; break; - // Scalar Unsigned Integer Convert To Floating-point - case AArch64::BI__builtin_neon_vcvts_f32_u32: - Int = Intrinsic::aarch64_neon_vcvtf32_u32, - s = "vcvtf"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_f64_u64: - Int = Intrinsic::aarch64_neon_vcvtf64_u64, - s = "vcvtf"; OverloadInt = false; break; - // Scalar Floating-point Converts - case AArch64::BI__builtin_neon_vcvtxd_f32_f64: - Int = Intrinsic::aarch64_neon_fcvtxn; - s = "vcvtxn"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtas_s32_f32: - case AArch64::BI__builtin_neon_vcvtad_s64_f64: - Int = Intrinsic::aarch64_neon_fcvtas; - s = "vcvtas"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtas_u32_f32: - case AArch64::BI__builtin_neon_vcvtad_u64_f64: - Int = Intrinsic::aarch64_neon_fcvtau; - s = "vcvtau"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtms_s32_f32: - case AArch64::BI__builtin_neon_vcvtmd_s64_f64: - Int = Intrinsic::aarch64_neon_fcvtms; - s = "vcvtms"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtms_u32_f32: - case AArch64::BI__builtin_neon_vcvtmd_u64_f64: - Int = Intrinsic::aarch64_neon_fcvtmu; - s = "vcvtmu"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtns_s32_f32: - case AArch64::BI__builtin_neon_vcvtnd_s64_f64: - Int = Intrinsic::aarch64_neon_fcvtns; - s = "vcvtns"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtns_u32_f32: - case AArch64::BI__builtin_neon_vcvtnd_u64_f64: - Int = Intrinsic::aarch64_neon_fcvtnu; - s = "vcvtnu"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtps_s32_f32: - case AArch64::BI__builtin_neon_vcvtpd_s64_f64: - Int = Intrinsic::aarch64_neon_fcvtps; - s = "vcvtps"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvtps_u32_f32: - case AArch64::BI__builtin_neon_vcvtpd_u64_f64: - Int = 
Intrinsic::aarch64_neon_fcvtpu; - s = "vcvtpu"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvts_s32_f32: - case AArch64::BI__builtin_neon_vcvtd_s64_f64: - Int = Intrinsic::aarch64_neon_fcvtzs; - s = "vcvtzs"; OverloadCvtInt = true; break; - case AArch64::BI__builtin_neon_vcvts_u32_f32: - case AArch64::BI__builtin_neon_vcvtd_u64_f64: - Int = Intrinsic::aarch64_neon_fcvtzu; - s = "vcvtzu"; OverloadCvtInt = true; break; - // Scalar Floating-point Reciprocal Estimate - case AArch64::BI__builtin_neon_vrecpes_f32: - case AArch64::BI__builtin_neon_vrecped_f64: - Int = Intrinsic::arm_neon_vrecpe; - s = "vrecpe"; OverloadInt = true; break; - // Scalar Floating-point Reciprocal Exponent - case AArch64::BI__builtin_neon_vrecpxs_f32: - case AArch64::BI__builtin_neon_vrecpxd_f64: - Int = Intrinsic::aarch64_neon_vrecpx; - s = "vrecpx"; OverloadInt = true; break; - // Scalar Floating-point Reciprocal Square Root Estimate - case AArch64::BI__builtin_neon_vrsqrtes_f32: - case AArch64::BI__builtin_neon_vrsqrted_f64: - Int = Intrinsic::arm_neon_vrsqrte; - s = "vrsqrte"; OverloadInt = true; break; - // Scalar Compare Equal - case AArch64::BI__builtin_neon_vceqd_s64: - case AArch64::BI__builtin_neon_vceqd_u64: - Int = Intrinsic::aarch64_neon_vceq; s = "vceq"; - OverloadCmpInt = true; break; - // Scalar Compare Equal To Zero - case AArch64::BI__builtin_neon_vceqzd_s64: - case AArch64::BI__builtin_neon_vceqzd_u64: - Int = Intrinsic::aarch64_neon_vceq; s = "vceq"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType())); - OverloadCmpInt = true; break; - // Scalar Compare Greater Than or Equal - case AArch64::BI__builtin_neon_vcged_s64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - OverloadCmpInt = true; break; - case AArch64::BI__builtin_neon_vcged_u64: - Int = Intrinsic::aarch64_neon_vchs; s = "vcge"; - OverloadCmpInt = true; break; - // Scalar Compare Greater Than or Equal To Zero - case AArch64::BI__builtin_neon_vcgezd_s64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType())); - OverloadCmpInt = true; break; - // Scalar Compare Greater Than - case AArch64::BI__builtin_neon_vcgtd_s64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - OverloadCmpInt = true; break; - case AArch64::BI__builtin_neon_vcgtd_u64: - Int = Intrinsic::aarch64_neon_vchi; s = "vcgt"; - OverloadCmpInt = true; break; - // Scalar Compare Greater Than Zero - case AArch64::BI__builtin_neon_vcgtzd_s64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType())); - OverloadCmpInt = true; break; - // Scalar Compare Less Than or Equal - case AArch64::BI__builtin_neon_vcled_s64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - case AArch64::BI__builtin_neon_vcled_u64: - Int = Intrinsic::aarch64_neon_vchs; s = "vchs"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - // Scalar Compare Less Than or Equal To Zero - case AArch64::BI__builtin_neon_vclezd_s64: - Int = Intrinsic::aarch64_neon_vclez; s = "vcle"; - // Add implicit zero operand. 
- Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType())); - OverloadCmpInt = true; break; - // Scalar Compare Less Than - case AArch64::BI__builtin_neon_vcltd_s64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - case AArch64::BI__builtin_neon_vcltd_u64: - Int = Intrinsic::aarch64_neon_vchi; s = "vchi"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - // Scalar Compare Less Than Zero - case AArch64::BI__builtin_neon_vcltzd_s64: - Int = Intrinsic::aarch64_neon_vcltz; s = "vclt"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType())); - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Equal - case AArch64::BI__builtin_neon_vceqs_f32: - case AArch64::BI__builtin_neon_vceqd_f64: - Int = Intrinsic::aarch64_neon_vceq; s = "vceq"; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Equal To Zero - case AArch64::BI__builtin_neon_vceqzs_f32: - case AArch64::BI__builtin_neon_vceqzd_f64: - Int = Intrinsic::aarch64_neon_vceq; s = "vceq"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy)); - IsFpCmpZInt = true; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Greater Than Or Equal - case AArch64::BI__builtin_neon_vcges_f32: - case AArch64::BI__builtin_neon_vcged_f64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Greater Than Or Equal To Zero - case AArch64::BI__builtin_neon_vcgezs_f32: - case AArch64::BI__builtin_neon_vcgezd_f64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy)); - IsFpCmpZInt = true; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Greather Than - case AArch64::BI__builtin_neon_vcgts_f32: - case AArch64::BI__builtin_neon_vcgtd_f64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Greather Than Zero - case AArch64::BI__builtin_neon_vcgtzs_f32: - case AArch64::BI__builtin_neon_vcgtzd_f64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy)); - IsFpCmpZInt = true; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Less Than or Equal - case AArch64::BI__builtin_neon_vcles_f32: - case AArch64::BI__builtin_neon_vcled_f64: - Int = Intrinsic::aarch64_neon_vcge; s = "vcge"; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Less Than Or Equal To Zero - case AArch64::BI__builtin_neon_vclezs_f32: - case AArch64::BI__builtin_neon_vclezd_f64: - Int = Intrinsic::aarch64_neon_vclez; s = "vcle"; - // Add implicit zero operand. - Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy)); - IsFpCmpZInt = true; - OverloadCmpInt = true; break; - // Scalar Floating-point Compare Less Than Zero - case AArch64::BI__builtin_neon_vclts_f32: - case AArch64::BI__builtin_neon_vcltd_f64: - Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - // Scalar Floating-point Compare Less Than Zero - case AArch64::BI__builtin_neon_vcltzs_f32: - case AArch64::BI__builtin_neon_vcltzd_f64: - Int = Intrinsic::aarch64_neon_vcltz; s = "vclt"; - // Add implicit zero operand. 
- Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy)); - IsFpCmpZInt = true; - OverloadCmpInt = true; break; - // Scalar Floating-point Absolute Compare Greater Than Or Equal - case AArch64::BI__builtin_neon_vcages_f32: - case AArch64::BI__builtin_neon_vcaged_f64: - Int = Intrinsic::aarch64_neon_vcage; s = "vcage"; - OverloadCmpInt = true; break; - // Scalar Floating-point Absolute Compare Greater Than - case AArch64::BI__builtin_neon_vcagts_f32: - case AArch64::BI__builtin_neon_vcagtd_f64: - Int = Intrinsic::aarch64_neon_vcagt; s = "vcagt"; - OverloadCmpInt = true; break; - // Scalar Floating-point Absolute Compare Less Than Or Equal - case AArch64::BI__builtin_neon_vcales_f32: - case AArch64::BI__builtin_neon_vcaled_f64: - Int = Intrinsic::aarch64_neon_vcage; s = "vcage"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - // Scalar Floating-point Absolute Compare Less Than - case AArch64::BI__builtin_neon_vcalts_f32: - case AArch64::BI__builtin_neon_vcaltd_f64: - Int = Intrinsic::aarch64_neon_vcagt; s = "vcalt"; - OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break; - // Scalar Compare Bitwise Test Bits - case AArch64::BI__builtin_neon_vtstd_s64: - case AArch64::BI__builtin_neon_vtstd_u64: - Int = Intrinsic::aarch64_neon_vtstd; s = "vtst"; - OverloadCmpInt = true; break; - // Scalar Absolute Value - case AArch64::BI__builtin_neon_vabsd_s64: - Int = Intrinsic::aarch64_neon_vabs; - s = "vabs"; OverloadInt = false; break; - // Scalar Absolute Difference - case AArch64::BI__builtin_neon_vabds_f32: - case AArch64::BI__builtin_neon_vabdd_f64: - Int = Intrinsic::aarch64_neon_vabd; - s = "vabd"; OverloadInt = true; break; - // Scalar Signed Saturating Absolute Value - case AArch64::BI__builtin_neon_vqabsb_s8: - case AArch64::BI__builtin_neon_vqabsh_s16: - case AArch64::BI__builtin_neon_vqabss_s32: - case AArch64::BI__builtin_neon_vqabsd_s64: - Int = Intrinsic::arm_neon_vqabs; - s = "vqabs"; OverloadInt = true; break; - // Scalar Negate - case AArch64::BI__builtin_neon_vnegd_s64: - Int = Intrinsic::aarch64_neon_vneg; - s = "vneg"; OverloadInt = false; break; - // Scalar Signed Saturating Negate - case AArch64::BI__builtin_neon_vqnegb_s8: - case AArch64::BI__builtin_neon_vqnegh_s16: - case AArch64::BI__builtin_neon_vqnegs_s32: - case AArch64::BI__builtin_neon_vqnegd_s64: - Int = Intrinsic::arm_neon_vqneg; - s = "vqneg"; OverloadInt = true; break; - // Scalar Signed Saturating Accumulated of Unsigned Value - case AArch64::BI__builtin_neon_vuqaddb_s8: - case AArch64::BI__builtin_neon_vuqaddh_s16: - case AArch64::BI__builtin_neon_vuqadds_s32: - case AArch64::BI__builtin_neon_vuqaddd_s64: - Int = Intrinsic::aarch64_neon_vuqadd; - s = "vuqadd"; OverloadInt = true; break; - // Scalar Unsigned Saturating Accumulated of Signed Value - case AArch64::BI__builtin_neon_vsqaddb_u8: - case AArch64::BI__builtin_neon_vsqaddh_u16: - case AArch64::BI__builtin_neon_vsqadds_u32: - case AArch64::BI__builtin_neon_vsqaddd_u64: - Int = Intrinsic::aarch64_neon_vsqadd; - s = "vsqadd"; OverloadInt = true; break; - // Signed Saturating Doubling Multiply-Add Long - case AArch64::BI__builtin_neon_vqdmlalh_s16: - case AArch64::BI__builtin_neon_vqdmlals_s32: - Int = Intrinsic::aarch64_neon_vqdmlal; - s = "vqdmlal"; OverloadWideInt = true; break; - // Signed Saturating Doubling Multiply-Subtract Long - case AArch64::BI__builtin_neon_vqdmlslh_s16: - case AArch64::BI__builtin_neon_vqdmlsls_s32: - Int = Intrinsic::aarch64_neon_vqdmlsl; - s = "vqdmlsl"; OverloadWideInt = true; break; - // Signed Saturating Doubling 
Multiply Long - case AArch64::BI__builtin_neon_vqdmullh_s16: - case AArch64::BI__builtin_neon_vqdmulls_s32: - Int = Intrinsic::arm_neon_vqdmull; - s = "vqdmull"; OverloadWideInt = true; break; - // Scalar Signed Saturating Extract Unsigned Narrow - case AArch64::BI__builtin_neon_vqmovunh_s16: - case AArch64::BI__builtin_neon_vqmovuns_s32: - case AArch64::BI__builtin_neon_vqmovund_s64: - Int = Intrinsic::arm_neon_vqmovnsu; - s = "vqmovun"; OverloadNarrowInt = true; break; - // Scalar Signed Saturating Extract Narrow - case AArch64::BI__builtin_neon_vqmovnh_s16: - case AArch64::BI__builtin_neon_vqmovns_s32: - case AArch64::BI__builtin_neon_vqmovnd_s64: - Int = Intrinsic::arm_neon_vqmovns; - s = "vqmovn"; OverloadNarrowInt = true; break; - // Scalar Unsigned Saturating Extract Narrow - case AArch64::BI__builtin_neon_vqmovnh_u16: - case AArch64::BI__builtin_neon_vqmovns_u32: - case AArch64::BI__builtin_neon_vqmovnd_u64: - Int = Intrinsic::arm_neon_vqmovnu; - s = "vqmovn"; OverloadNarrowInt = true; break; - // Scalar Signed Shift Right (Immediate) - case AArch64::BI__builtin_neon_vshrd_n_s64: - Int = Intrinsic::aarch64_neon_vshrds_n; - s = "vsshr"; OverloadInt = false; break; - // Scalar Unsigned Shift Right (Immediate) - case AArch64::BI__builtin_neon_vshrd_n_u64: - Int = Intrinsic::aarch64_neon_vshrdu_n; - s = "vushr"; OverloadInt = false; break; - // Scalar Signed Rounding Shift Right (Immediate) - case AArch64::BI__builtin_neon_vrshrd_n_s64: - Int = Intrinsic::aarch64_neon_vsrshr; - s = "vsrshr"; OverloadInt = true; break; - // Scalar Unsigned Rounding Shift Right (Immediate) - case AArch64::BI__builtin_neon_vrshrd_n_u64: - Int = Intrinsic::aarch64_neon_vurshr; - s = "vurshr"; OverloadInt = true; break; - // Scalar Signed Shift Right and Accumulate (Immediate) - case AArch64::BI__builtin_neon_vsrad_n_s64: - Int = Intrinsic::aarch64_neon_vsrads_n; - s = "vssra"; OverloadInt = false; break; - // Scalar Unsigned Shift Right and Accumulate (Immediate) - case AArch64::BI__builtin_neon_vsrad_n_u64: - Int = Intrinsic::aarch64_neon_vsradu_n; - s = "vusra"; OverloadInt = false; break; - // Scalar Signed Rounding Shift Right and Accumulate (Immediate) - case AArch64::BI__builtin_neon_vrsrad_n_s64: - Int = Intrinsic::aarch64_neon_vrsrads_n; - s = "vsrsra"; OverloadInt = false; break; - // Scalar Unsigned Rounding Shift Right and Accumulate (Immediate) - case AArch64::BI__builtin_neon_vrsrad_n_u64: - Int = Intrinsic::aarch64_neon_vrsradu_n; - s = "vursra"; OverloadInt = false; break; - // Scalar Signed/Unsigned Shift Left (Immediate) - case AArch64::BI__builtin_neon_vshld_n_s64: - case AArch64::BI__builtin_neon_vshld_n_u64: - Int = Intrinsic::aarch64_neon_vshld_n; - s = "vshl"; OverloadInt = false; break; - // Signed Saturating Shift Left (Immediate) - case AArch64::BI__builtin_neon_vqshlb_n_s8: - case AArch64::BI__builtin_neon_vqshlh_n_s16: - case AArch64::BI__builtin_neon_vqshls_n_s32: - case AArch64::BI__builtin_neon_vqshld_n_s64: - Int = Intrinsic::aarch64_neon_vqshls_n; - s = "vsqshl"; OverloadInt = true; break; - // Unsigned Saturating Shift Left (Immediate) - case AArch64::BI__builtin_neon_vqshlb_n_u8: - case AArch64::BI__builtin_neon_vqshlh_n_u16: - case AArch64::BI__builtin_neon_vqshls_n_u32: - case AArch64::BI__builtin_neon_vqshld_n_u64: - Int = Intrinsic::aarch64_neon_vqshlu_n; - s = "vuqshl"; OverloadInt = true; break; - // Signed Saturating Shift Left Unsigned (Immediate) - case AArch64::BI__builtin_neon_vqshlub_n_s8: - case AArch64::BI__builtin_neon_vqshluh_n_s16: - case 
AArch64::BI__builtin_neon_vqshlus_n_s32: - case AArch64::BI__builtin_neon_vqshlud_n_s64: - Int = Intrinsic::aarch64_neon_vsqshlu; - s = "vsqshlu"; OverloadInt = true; break; - // Shift Right And Insert (Immediate) - case AArch64::BI__builtin_neon_vsrid_n_s64: - case AArch64::BI__builtin_neon_vsrid_n_u64: - Int = Intrinsic::aarch64_neon_vsri; - s = "vsri"; OverloadInt = true; break; - // Shift Left And Insert (Immediate) - case AArch64::BI__builtin_neon_vslid_n_s64: - case AArch64::BI__builtin_neon_vslid_n_u64: - Int = Intrinsic::aarch64_neon_vsli; - s = "vsli"; OverloadInt = true; break; - // Signed Saturating Shift Right Narrow (Immediate) - case AArch64::BI__builtin_neon_vqshrnh_n_s16: - case AArch64::BI__builtin_neon_vqshrns_n_s32: - case AArch64::BI__builtin_neon_vqshrnd_n_s64: - Int = Intrinsic::aarch64_neon_vsqshrn; - s = "vsqshrn"; OverloadInt = true; break; - // Unsigned Saturating Shift Right Narrow (Immediate) - case AArch64::BI__builtin_neon_vqshrnh_n_u16: - case AArch64::BI__builtin_neon_vqshrns_n_u32: - case AArch64::BI__builtin_neon_vqshrnd_n_u64: - Int = Intrinsic::aarch64_neon_vuqshrn; - s = "vuqshrn"; OverloadInt = true; break; - // Signed Saturating Rounded Shift Right Narrow (Immediate) - case AArch64::BI__builtin_neon_vqrshrnh_n_s16: - case AArch64::BI__builtin_neon_vqrshrns_n_s32: - case AArch64::BI__builtin_neon_vqrshrnd_n_s64: - Int = Intrinsic::aarch64_neon_vsqrshrn; - s = "vsqrshrn"; OverloadInt = true; break; - // Unsigned Saturating Rounded Shift Right Narrow (Immediate) - case AArch64::BI__builtin_neon_vqrshrnh_n_u16: - case AArch64::BI__builtin_neon_vqrshrns_n_u32: - case AArch64::BI__builtin_neon_vqrshrnd_n_u64: - Int = Intrinsic::aarch64_neon_vuqrshrn; - s = "vuqrshrn"; OverloadInt = true; break; - // Signed Saturating Shift Right Unsigned Narrow (Immediate) - case AArch64::BI__builtin_neon_vqshrunh_n_s16: - case AArch64::BI__builtin_neon_vqshruns_n_s32: - case AArch64::BI__builtin_neon_vqshrund_n_s64: - Int = Intrinsic::aarch64_neon_vsqshrun; - s = "vsqshrun"; OverloadInt = true; break; - // Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) - case AArch64::BI__builtin_neon_vqrshrunh_n_s16: - case AArch64::BI__builtin_neon_vqrshruns_n_s32: - case AArch64::BI__builtin_neon_vqrshrund_n_s64: - Int = Intrinsic::aarch64_neon_vsqrshrun; - s = "vsqrshrun"; OverloadInt = true; break; - // Scalar Signed Fixed-point Convert To Floating-Point (Immediate) - case AArch64::BI__builtin_neon_vcvts_n_f32_s32: - Int = Intrinsic::aarch64_neon_vcvtf32_n_s32; - s = "vcvtf"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_n_f64_s64: - Int = Intrinsic::aarch64_neon_vcvtf64_n_s64; - s = "vcvtf"; OverloadInt = false; break; - // Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) - case AArch64::BI__builtin_neon_vcvts_n_f32_u32: - Int = Intrinsic::aarch64_neon_vcvtf32_n_u32; - s = "vcvtf"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_n_f64_u64: - Int = Intrinsic::aarch64_neon_vcvtf64_n_u64; - s = "vcvtf"; OverloadInt = false; break; - // Scalar Floating-point Convert To Signed Fixed-point (Immediate) - case AArch64::BI__builtin_neon_vcvts_n_s32_f32: - Int = Intrinsic::aarch64_neon_vcvts_n_s32_f32; - s = "fcvtzs"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_n_s64_f64: - Int = Intrinsic::aarch64_neon_vcvtd_n_s64_f64; - s = "fcvtzs"; OverloadInt = false; break; - // Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) - case AArch64::BI__builtin_neon_vcvts_n_u32_f32: - Int = 
Intrinsic::aarch64_neon_vcvts_n_u32_f32; - s = "fcvtzu"; OverloadInt = false; break; - case AArch64::BI__builtin_neon_vcvtd_n_u64_f64: - Int = Intrinsic::aarch64_neon_vcvtd_n_u64_f64; - s = "fcvtzu"; OverloadInt = false; break; - } - - if (!Int) - return 0; - - // AArch64 scalar builtin that returns scalar type - // and should be mapped to AArch64 intrinsic that returns - // one-element vector type. - Function *F = 0; - if (AcrossVec) { - // Gen arg type - const Expr *Arg = E->getArg(E->getNumArgs()-1); - llvm::Type *Ty = CGF.ConvertType(Arg->getType()); - llvm::VectorType *VTy = cast<llvm::VectorType>(Ty); - llvm::Type *ETy = VTy->getElementType(); - llvm::VectorType *RTy = llvm::VectorType::get(ETy, 1); - - if (ExtendEle) { - assert(!ETy->isFloatingPointTy()); - RTy = llvm::VectorType::getExtendedElementVectorType(RTy); - } - - llvm::Type *Tys[2] = {RTy, VTy}; - F = CGF.CGM.getIntrinsic(Int, Tys); - assert(E->getNumArgs() == 1); - } else if (OverloadInt) { - // Determine the type of this overloaded AArch64 intrinsic - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1); - assert(VTy); - - F = CGF.CGM.getIntrinsic(Int, VTy); - } else if (OverloadWideInt || OverloadNarrowInt) { - // Determine the type of this overloaded AArch64 intrinsic - const Expr *Arg = E->getArg(E->getNumArgs()-1); - llvm::Type *Ty = CGF.ConvertType(Arg->getType()); - llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1); - llvm::VectorType *RTy = OverloadWideInt ? - llvm::VectorType::getExtendedElementVectorType(VTy) : - llvm::VectorType::getTruncatedElementVectorType(VTy); - F = CGF.CGM.getIntrinsic(Int, RTy); - } else if (OverloadCmpInt) { - // Determine the types of this overloaded AArch64 intrinsic - SmallVector<llvm::Type *, 3> Tys; - const Expr *Arg = E->getArg(E->getNumArgs()-1); - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1); - Tys.push_back(VTy); - Ty = CGF.ConvertType(Arg->getType()); - VTy = llvm::VectorType::get(Ty, 1); - Tys.push_back(VTy); - if(IsFpCmpZInt) - VTy = llvm::VectorType::get(CGF.FloatTy, 1); - Tys.push_back(VTy); - - F = CGF.CGM.getIntrinsic(Int, Tys); - } else if (OverloadCvtInt) { - // Determine the types of this overloaded AArch64 intrinsic - SmallVector<llvm::Type *, 2> Tys; - const Expr *Arg = E->getArg(E->getNumArgs()-1); - llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType()); - llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1); - Tys.push_back(VTy); - Ty = CGF.ConvertType(Arg->getType()); - VTy = llvm::VectorType::get(Ty, 1); - Tys.push_back(VTy); - - F = CGF.CGM.getIntrinsic(Int, Tys); - } else - F = CGF.CGM.getIntrinsic(Int); - - Value *Result = CGF.EmitNeonCall(F, Ops, s); - llvm::Type *ResultType = CGF.ConvertType(E->getType()); - // AArch64 intrinsic one-element vector type cast to - // scalar type expected by the builtin - return CGF.Builder.CreateBitCast(Result, ResultType, s); +enum { + AddRetType = (1 << 0), + Add1ArgType = (1 << 1), + Add2ArgTypes = (1 << 2), + + VectorizeRetType = (1 << 3), + VectorizeArgTypes = (1 << 4), + + InventFloatType = (1 << 5), + UnsignedAlts = (1 << 6), + + Use64BitVectors = (1 << 7), + Use128BitVectors = (1 << 8), + + Vectorize1ArgType = Add1ArgType | VectorizeArgTypes, + VectorRet = AddRetType | VectorizeRetType, + VectorRetGetArgs01 = + AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes, + FpCmpzModifiers = + AddRetType | VectorizeRetType | Add1ArgType | InventFloatType +}; + + struct 
NeonIntrinsicInfo { + unsigned BuiltinID; + unsigned LLVMIntrinsic; + unsigned AltLLVMIntrinsic; + const char *NameHint; + unsigned TypeModifier; + + bool operator<(unsigned RHSBuiltinID) const { + return BuiltinID < RHSBuiltinID; + } +}; + +#define NEONMAP0(NameBase) \ + { NEON::BI__builtin_neon_ ## NameBase, 0, 0, #NameBase, 0 } + +#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \ + { NEON:: BI__builtin_neon_ ## NameBase, \ + Intrinsic::LLVMIntrinsic, 0, #NameBase, TypeModifier } + +#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \ + { NEON:: BI__builtin_neon_ ## NameBase, \ + Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \ + #NameBase, TypeModifier } + +static NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), + NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), + NEONMAP1(vabs_v, arm_neon_vabs, 0), + NEONMAP1(vabsq_v, arm_neon_vabs, 0), + NEONMAP0(vaddhn_v), + NEONMAP1(vaesdq_v, arm_neon_aesd, 0), + NEONMAP1(vaeseq_v, arm_neon_aese, 0), + NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0), + NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0), + NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType), + NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType), + NEONMAP1(vcage_v, arm_neon_vacge, 0), + NEONMAP1(vcageq_v, arm_neon_vacge, 0), + NEONMAP1(vcagt_v, arm_neon_vacgt, 0), + NEONMAP1(vcagtq_v, arm_neon_vacgt, 0), + NEONMAP1(vcale_v, arm_neon_vacge, 0), + NEONMAP1(vcaleq_v, arm_neon_vacge, 0), + NEONMAP1(vcalt_v, arm_neon_vacgt, 0), + NEONMAP1(vcaltq_v, arm_neon_vacgt, 0), + NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType), + NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType), + NEONMAP1(vclz_v, ctlz, Add1ArgType), + NEONMAP1(vclzq_v, ctlz, Add1ArgType), + NEONMAP1(vcnt_v, ctpop, Add1ArgType), + NEONMAP1(vcntq_v, ctpop, Add1ArgType), + NEONMAP1(vcvt_f16_v, arm_neon_vcvtfp2hf, 0), + NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0), + NEONMAP0(vcvt_f32_v), + NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP0(vcvt_s32_v), + NEONMAP0(vcvt_s64_v), + NEONMAP0(vcvt_u32_v), + NEONMAP0(vcvt_u64_v), + NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0), + NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0), + NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0), + NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0), + NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtp_u32_v, 
arm_neon_vcvtpu, 0), + NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0), + NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0), + NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0), + NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0), + NEONMAP0(vcvtq_f32_v), + NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0), + NEONMAP0(vcvtq_s32_v), + NEONMAP0(vcvtq_s64_v), + NEONMAP0(vcvtq_u32_v), + NEONMAP0(vcvtq_u64_v), + NEONMAP0(vext_v), + NEONMAP0(vextq_v), + NEONMAP0(vfma_v), + NEONMAP0(vfmaq_v), + NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts), + NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts), + NEONMAP0(vld1_dup_v), + NEONMAP1(vld1_v, arm_neon_vld1, 0), + NEONMAP0(vld1q_dup_v), + NEONMAP1(vld1q_v, arm_neon_vld1, 0), + NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0), + NEONMAP1(vld2_v, arm_neon_vld2, 0), + NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0), + NEONMAP1(vld2q_v, arm_neon_vld2, 0), + NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0), + NEONMAP1(vld3_v, arm_neon_vld3, 0), + NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0), + NEONMAP1(vld3q_v, arm_neon_vld3, 0), + NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0), + NEONMAP1(vld4_v, arm_neon_vld4, 0), + NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0), + NEONMAP1(vld4q_v, arm_neon_vld4, 0), + NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts), + NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts), + NEONMAP0(vmovl_v), + NEONMAP0(vmovn_v), + NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType), + NEONMAP0(vmull_v), + NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType), + NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts), + NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts), + NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType), + NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts), + NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts), + NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType), + NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts), + NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts), + NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType), + NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType), + NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0), + NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0), + NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType), + NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType), + NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType), + NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts), + NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType), + NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType), + NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType), + 
NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType), + NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType), + NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts), + NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts), + NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts), + NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType), + NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0), + NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0), + NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType), + NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType), + NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts), + NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts), + NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0), + NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0), + NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType), + NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType), + NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType), + NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0), + NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0), + NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0), + NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0), + NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0), + NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0), + NEONMAP0(vshl_n_v), + NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts), + NEONMAP0(vshll_n_v), + NEONMAP0(vshlq_n_v), + NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts), + NEONMAP0(vshr_n_v), + NEONMAP0(vshrn_n_v), + NEONMAP0(vshrq_n_v), + NEONMAP1(vst1_v, arm_neon_vst1, 0), + NEONMAP1(vst1q_v, arm_neon_vst1, 0), + NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0), + NEONMAP1(vst2_v, arm_neon_vst2, 0), + NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0), + NEONMAP1(vst2q_v, arm_neon_vst2, 0), + NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0), + NEONMAP1(vst3_v, arm_neon_vst3, 0), + NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0), + NEONMAP1(vst3q_v, arm_neon_vst3, 0), + NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0), + NEONMAP1(vst4_v, arm_neon_vst4, 0), + NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0), + NEONMAP1(vst4q_v, arm_neon_vst4, 0), + NEONMAP0(vsubhn_v), + NEONMAP0(vtrn_v), + NEONMAP0(vtrnq_v), + NEONMAP0(vtst_v), + NEONMAP0(vtstq_v), + NEONMAP0(vuzp_v), + NEONMAP0(vuzpq_v), + NEONMAP0(vzip_v), + NEONMAP0(vzipq_v) +}; + +static NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + NEONMAP1(vabs_v, aarch64_neon_abs, 0), + NEONMAP1(vabsq_v, aarch64_neon_abs, 0), + NEONMAP0(vaddhn_v), + NEONMAP1(vaesdq_v, aarch64_crypto_aesd, 0), + NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0), + NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0), + NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0), + NEONMAP1(vcage_v, aarch64_neon_facge, 0), + NEONMAP1(vcageq_v, aarch64_neon_facge, 0), + NEONMAP1(vcagt_v, aarch64_neon_facgt, 
0), + NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0), + NEONMAP1(vcale_v, aarch64_neon_facge, 0), + NEONMAP1(vcaleq_v, aarch64_neon_facge, 0), + NEONMAP1(vcalt_v, aarch64_neon_facgt, 0), + NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0), + NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType), + NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType), + NEONMAP1(vclz_v, ctlz, Add1ArgType), + NEONMAP1(vclzq_v, ctlz, Add1ArgType), + NEONMAP1(vcnt_v, ctpop, Add1ArgType), + NEONMAP1(vcntq_v, ctpop, Add1ArgType), + NEONMAP1(vcvt_f16_v, aarch64_neon_vcvtfp2hf, 0), + NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0), + NEONMAP0(vcvt_f32_v), + NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP0(vcvtq_f32_v), + NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), + NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), + NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), + NEONMAP0(vext_v), + NEONMAP0(vextq_v), + NEONMAP0(vfma_v), + NEONMAP0(vfmaq_v), + NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), + NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), + NEONMAP0(vmovl_v), + NEONMAP0(vmovn_v), + NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType), + NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType), + NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType), + NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts), + NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts), + NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType), + NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType), + NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType), + NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0), + NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0), + NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType), + NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType), + NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType), + NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts), + NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType), + NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType), + NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType), + NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, 
UnsignedAlts), + NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl,UnsignedAlts), + NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts), + NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType), + NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0), + NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0), + NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType), + NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType), + NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts), + NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts), + NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0), + NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0), + NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType), + NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType), + NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType), + NEONMAP1(vsha1su0q_v, aarch64_crypto_sha1su0, 0), + NEONMAP1(vsha1su1q_v, aarch64_crypto_sha1su1, 0), + NEONMAP1(vsha256h2q_v, aarch64_crypto_sha256h2, 0), + NEONMAP1(vsha256hq_v, aarch64_crypto_sha256h, 0), + NEONMAP1(vsha256su0q_v, aarch64_crypto_sha256su0, 0), + NEONMAP1(vsha256su1q_v, aarch64_crypto_sha256su1, 0), + NEONMAP0(vshl_n_v), + NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts), + NEONMAP0(vshll_n_v), + NEONMAP0(vshlq_n_v), + NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts), + NEONMAP0(vshr_n_v), + NEONMAP0(vshrn_n_v), + NEONMAP0(vshrq_n_v), + NEONMAP0(vsubhn_v), + NEONMAP0(vtst_v), + NEONMAP0(vtstq_v), +}; + +static NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = { + NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType), + NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType), + NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType), + NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType), + NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType), + NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType), + NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType), + NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType), + NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType), + NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType), + NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType), + 
NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType), + NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType), + NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0), + NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), + NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), + NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), + NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType), + NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType), + NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), + NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), + NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType), + NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), + NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), + NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType), + NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), + NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType), + NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType), + NEONMAP1(vminvq_u32, 
aarch64_neon_uminv, AddRetType | Add1ArgType), + NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0), + NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType), + NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType), + NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType), + NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), + NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), + NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType), + NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType), + NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType), + NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType), + NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType), + NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType), + NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType), + NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType), + NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType), + NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType), + NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType), + NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType), + NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType), + NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors), + NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0), + NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType), + NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType), + NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors), + NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors), + NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors), + NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors), + NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType), + NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors), + NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors), + NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType), + NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType), + NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType), + NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType), + NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrshlh_u16, 
aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType), + NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType), + NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType), + NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType), + NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType), + NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors), + NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors), + NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType), + NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType), + NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType), + NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType), + NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType), + NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType), + NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType), + NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType), + NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType), + NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors), + NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType), + NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors), + NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors), + NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType), + NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType), + NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType), + NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType), + NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType), + NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType), + NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType), + NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType), + NEONMAP1(vrshld_s64, aarch64_neon_srshl, 
Add1ArgType), + NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType), + NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType), + NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType), + NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType), + NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType), + NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0), + NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0), + NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0), + NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0), + NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType), + NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType), + NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType), + NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType), + NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType), + NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType), + NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType), + NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType), + NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType), + NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), + NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType), +}; + +#undef NEONMAP0 +#undef NEONMAP1 +#undef NEONMAP2 + +static bool NEONSIMDIntrinsicsProvenSorted = false; + +static bool AArch64SIMDIntrinsicsProvenSorted = false; +static bool AArch64SISDIntrinsicsProvenSorted = false; + + +static const NeonIntrinsicInfo * +findNeonIntrinsicInMap(ArrayRef<NeonIntrinsicInfo> IntrinsicMap, + unsigned BuiltinID, bool &MapProvenSorted) { + +#ifndef NDEBUG + if (!MapProvenSorted) { + // FIXME: use std::is_sorted once C++11 is allowed + for (unsigned i = 0; i < IntrinsicMap.size() - 1; ++i) + assert(IntrinsicMap[i].BuiltinID <= IntrinsicMap[i + 1].BuiltinID); + MapProvenSorted = true; + } +#endif + + const NeonIntrinsicInfo *Builtin = + std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID); + + if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID) + return Builtin; + + return nullptr; } -Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr( - Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp, - const CmpInst::Predicate Ip, const Twine &Name) { - llvm::Type *OTy = ((llvm::User *)Op)->getOperand(0)->getType(); - if (OTy->isPointerTy()) - OTy = Ty; - Op = Builder.CreateBitCast(Op, OTy); - if (((llvm::VectorType *)OTy)->getElementType()->isFloatingPointTy()) { - Op = Builder.CreateFCmp(Fp, Op, ConstantAggregateZero::get(OTy)); - } else { - Op = Builder.CreateICmp(Ip, Op, ConstantAggregateZero::get(OTy)); - } - return Builder.CreateZExt(Op, Ty, Name); -} +Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID, + unsigned Modifier, + llvm::Type *ArgType, + const CallExpr *E) { + int VectorSize = 0; + if (Modifier & Use64BitVectors) + VectorSize = 64; + else if (Modifier & Use128BitVectors) + VectorSize = 128; -static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops, - Value *ExtOp, Value *IndexOp, - llvm::Type *ResTy, unsigned IntID, - const char *Name) { - SmallVector<Value *, 2> TblOps; - if (ExtOp) - TblOps.push_back(ExtOp); + // Return type. 
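
The lookup scheme above depends on the NEONMAP tables being sorted by BuiltinID so that findNeonIntrinsicInMap can binary-search them with std::lower_bound, using the asymmetric operator<(unsigned) defined on NeonIntrinsicInfo. A minimal standalone sketch of that table-plus-lookup pattern, with hypothetical IDs and entries (the real keys are the NEON::BI__builtin_neon_* enumerators):

    #include <algorithm>
    #include <cassert>
    #include <cstdio>

    // Cut-down stand-in for NeonIntrinsicInfo: entries kept sorted by BuiltinID
    // and searched with std::lower_bound via the asymmetric operator<(unsigned).
    struct MapEntry {
      unsigned BuiltinID;
      const char *NameHint;
      bool operator<(unsigned RHSBuiltinID) const { return BuiltinID < RHSBuiltinID; }
    };

    // Hypothetical IDs for illustration only.
    static const MapEntry DemoMap[] = {
      { 10, "vabd_v" }, { 20, "vabs_v" }, { 30, "vaddhn_v" }, { 40, "vbsl_v" },
    };

    static const MapEntry *findInMap(unsigned BuiltinID) {
      const unsigned N = sizeof(DemoMap) / sizeof(DemoMap[0]);
      // Debug-only sortedness check, mirroring the NDEBUG-guarded loop above.
      for (unsigned i = 0; i + 1 < N; ++i)
        assert(DemoMap[i].BuiltinID <= DemoMap[i + 1].BuiltinID);
      const MapEntry *It = std::lower_bound(DemoMap, DemoMap + N, BuiltinID);
      return (It != DemoMap + N && It->BuiltinID == BuiltinID) ? It : 0;
    }

    int main() {
      if (const MapEntry *E = findInMap(30))
        std::printf("hit: %s\n", E->NameHint);   // prints "hit: vaddhn_v"
      if (!findInMap(25))
        std::printf("no entry for 25\n");
      return 0;
    }
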
+ SmallVector<llvm::Type *, 3> Tys; + if (Modifier & AddRetType) { + llvm::Type *Ty = ConvertType(E->getCallReturnType()); + if (Modifier & VectorizeRetType) + Ty = llvm::VectorType::get( + Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1); - // Build a vector containing sequential number like (0, 1, 2, ..., 15) - SmallVector<Constant*, 16> Indices; - llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType()); - for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { - Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i)); - Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i+1)); + Tys.push_back(Ty); } - Value *SV = llvm::ConstantVector::get(Indices); - int PairPos = 0, End = Ops.size() - 1; - while (PairPos < End) { - TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], - Ops[PairPos+1], SV, Name)); - PairPos += 2; + // Arguments. + if (Modifier & VectorizeArgTypes) { + int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1; + ArgType = llvm::VectorType::get(ArgType, Elts); } - // If there's an odd number of 64-bit lookup table, fill the high 64-bit - // of the 128-bit lookup table with zero. - if (PairPos == End) { - Value *ZeroTbl = ConstantAggregateZero::get(TblTy); - TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], - ZeroTbl, SV, Name)); - } + if (Modifier & (Add1ArgType | Add2ArgTypes)) + Tys.push_back(ArgType); - TblTy = llvm::VectorType::get(TblTy->getElementType(), - 2*TblTy->getNumElements()); - llvm::Type *Tys[2] = { ResTy, TblTy }; + if (Modifier & Add2ArgTypes) + Tys.push_back(ArgType); - Function *TblF; - TblOps.push_back(IndexOp); - TblF = CGF.CGM.getIntrinsic(IntID, Tys); - - return CGF.EmitNeonCall(TblF, TblOps, Name); + if (Modifier & InventFloatType) + Tys.push_back(FloatTy); + + return CGM.getIntrinsic(IntrinsicID, Tys); } -static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, - unsigned BuiltinID, - const CallExpr *E) { - unsigned int Int = 0; - const char *s = NULL; +static Value *EmitCommonNeonSISDBuiltinExpr(CodeGenFunction &CGF, + const NeonIntrinsicInfo &SISDInfo, + SmallVectorImpl<Value *> &Ops, + const CallExpr *E) { + unsigned BuiltinID = SISDInfo.BuiltinID; + unsigned int Int = SISDInfo.LLVMIntrinsic; + unsigned Modifier = SISDInfo.TypeModifier; + const char *s = SISDInfo.NameHint; - unsigned TblPos; switch (BuiltinID) { - default: - return 0; - case AArch64::BI__builtin_neon_vtbl1_v: - case AArch64::BI__builtin_neon_vqtbl1_v: - case AArch64::BI__builtin_neon_vqtbl1q_v: - case AArch64::BI__builtin_neon_vtbl2_v: - case AArch64::BI__builtin_neon_vqtbl2_v: - case AArch64::BI__builtin_neon_vqtbl2q_v: - case AArch64::BI__builtin_neon_vtbl3_v: - case AArch64::BI__builtin_neon_vqtbl3_v: - case AArch64::BI__builtin_neon_vqtbl3q_v: - case AArch64::BI__builtin_neon_vtbl4_v: - case AArch64::BI__builtin_neon_vqtbl4_v: - case AArch64::BI__builtin_neon_vqtbl4q_v: - TblPos = 0; - break; - case AArch64::BI__builtin_neon_vtbx1_v: - case AArch64::BI__builtin_neon_vqtbx1_v: - case AArch64::BI__builtin_neon_vqtbx1q_v: - case AArch64::BI__builtin_neon_vtbx2_v: - case AArch64::BI__builtin_neon_vqtbx2_v: - case AArch64::BI__builtin_neon_vqtbx2q_v: - case AArch64::BI__builtin_neon_vtbx3_v: - case AArch64::BI__builtin_neon_vqtbx3_v: - case AArch64::BI__builtin_neon_vqtbx3q_v: - case AArch64::BI__builtin_neon_vtbx4_v: - case AArch64::BI__builtin_neon_vqtbx4_v: - case AArch64::BI__builtin_neon_vqtbx4q_v: - TblPos = 1; + case NEON::BI__builtin_neon_vcled_s64: + case NEON::BI__builtin_neon_vcled_u64: + 
case NEON::BI__builtin_neon_vcles_f32: + case NEON::BI__builtin_neon_vcled_f64: + case NEON::BI__builtin_neon_vcltd_s64: + case NEON::BI__builtin_neon_vcltd_u64: + case NEON::BI__builtin_neon_vclts_f32: + case NEON::BI__builtin_neon_vcltd_f64: + case NEON::BI__builtin_neon_vcales_f32: + case NEON::BI__builtin_neon_vcaled_f64: + case NEON::BI__builtin_neon_vcalts_f32: + case NEON::BI__builtin_neon_vcaltd_f64: + // Only one direction of comparisons actually exist, cmle is actually a cmge + // with swapped operands. The table gives us the right intrinsic but we + // still need to do the swap. + std::swap(Ops[0], Ops[1]); break; } - assert(E->getNumArgs() >= 3); - - // Get the last argument, which specifies the vector type. - llvm::APSInt Result; - const Expr *Arg = E->getArg(E->getNumArgs() - 1); - if (!Arg->isIntegerConstantExpr(Result, CGF.getContext())) - return 0; - - // Determine the type of this overloaded NEON intrinsic. - NeonTypeFlags Type(Result.getZExtValue()); - llvm::VectorType *VTy = GetNeonType(&CGF, Type); - llvm::Type *Ty = VTy; - if (!Ty) - return 0; - - SmallVector<Value *, 4> Ops; - for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) { - Ops.push_back(CGF.EmitScalarExpr(E->getArg(i))); - } - - Arg = E->getArg(TblPos); - llvm::Type *TblTy = CGF.ConvertType(Arg->getType()); - llvm::VectorType *VTblTy = cast<llvm::VectorType>(TblTy); - llvm::Type *Tys[2] = { Ty, VTblTy }; - unsigned nElts = VTy->getNumElements(); + assert(Int && "Generic code assumes a valid intrinsic"); - // AArch64 scalar builtins are not overloaded, they do not have an extra - // argument that specifies the vector type, need to handle each case. - SmallVector<Value *, 2> TblOps; - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vtbl1_v: { - TblOps.push_back(Ops[0]); - return packTBLDVectorList(CGF, TblOps, 0, Ops[1], Ty, - Intrinsic::aarch64_neon_vtbl1, "vtbl1"); - } - case AArch64::BI__builtin_neon_vtbl2_v: { - TblOps.push_back(Ops[0]); - TblOps.push_back(Ops[1]); - return packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty, - Intrinsic::aarch64_neon_vtbl1, "vtbl1"); - } - case AArch64::BI__builtin_neon_vtbl3_v: { - TblOps.push_back(Ops[0]); - TblOps.push_back(Ops[1]); - TblOps.push_back(Ops[2]); - return packTBLDVectorList(CGF, TblOps, 0, Ops[3], Ty, - Intrinsic::aarch64_neon_vtbl2, "vtbl2"); - } - case AArch64::BI__builtin_neon_vtbl4_v: { - TblOps.push_back(Ops[0]); - TblOps.push_back(Ops[1]); - TblOps.push_back(Ops[2]); - TblOps.push_back(Ops[3]); - return packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty, - Intrinsic::aarch64_neon_vtbl2, "vtbl2"); - } - case AArch64::BI__builtin_neon_vtbx1_v: { - TblOps.push_back(Ops[1]); - Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty, - Intrinsic::aarch64_neon_vtbl1, "vtbl1"); - - llvm::Constant *Eight = ConstantInt::get(VTy->getElementType(), 8); - Value* EightV = llvm::ConstantVector::getSplat(nElts, Eight); - Value *CmpRes = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV); - CmpRes = CGF.Builder.CreateSExt(CmpRes, Ty); + // Determine the type(s) of this overloaded AArch64 intrinsic. 
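
As the comment notes, only the greater-or-equal / greater-than direction of these comparisons exists in hardware, so vcle/vclt (and the absolute variants vcale/vcalt) are emitted as the "ge"/"gt" intrinsic with the operands swapped. The identity being exploited, shown on plain scalars as an illustration only:

    #include <cmath>
    #include <cstdio>

    // The only primitives we pretend to have, like CMGE / FACGE.
    static bool ge(double A, double B) { return A >= B; }
    static bool absGe(double A, double B) { return std::fabs(A) >= std::fabs(B); }

    int main() {
      double X = 1.5, Y = -2.0;
      bool Le = ge(Y, X);        // "X <= Y" emitted as "Y >= X"
      bool CaLe = absGe(Y, X);   // "|X| <= |Y|" emitted as FACGE with swapped operands
      std::printf("x<=y: %d  |x|<=|y|: %d\n", Le, CaLe);  // prints "x<=y: 0  |x|<=|y|: 1"
      return 0;
    }
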
+ const Expr *Arg = E->getArg(0); + llvm::Type *ArgTy = CGF.ConvertType(Arg->getType()); + Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E); - SmallVector<Value *, 4> BslOps; - BslOps.push_back(CmpRes); - BslOps.push_back(Ops[0]); - BslOps.push_back(TblRes); - Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty); - return CGF.EmitNeonCall(BslF, BslOps, "vbsl"); - } - case AArch64::BI__builtin_neon_vtbx2_v: { - TblOps.push_back(Ops[1]); - TblOps.push_back(Ops[2]); - return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty, - Intrinsic::aarch64_neon_vtbx1, "vtbx1"); - } - case AArch64::BI__builtin_neon_vtbx3_v: { - TblOps.push_back(Ops[1]); - TblOps.push_back(Ops[2]); - TblOps.push_back(Ops[3]); - Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty, - Intrinsic::aarch64_neon_vtbl2, "vtbl2"); + int j = 0; + ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0); + for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++j) { + llvm::Type *ArgTy = ai->getType(); + if (Ops[j]->getType()->getPrimitiveSizeInBits() == + ArgTy->getPrimitiveSizeInBits()) + continue; - llvm::Constant *TwentyFour = ConstantInt::get(VTy->getElementType(), 24); - Value* TwentyFourV = llvm::ConstantVector::getSplat(nElts, TwentyFour); - Value *CmpRes = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4], - TwentyFourV); - CmpRes = CGF.Builder.CreateSExt(CmpRes, Ty); - - SmallVector<Value *, 4> BslOps; - BslOps.push_back(CmpRes); - BslOps.push_back(Ops[0]); - BslOps.push_back(TblRes); - Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty); - return CGF.EmitNeonCall(BslF, BslOps, "vbsl"); - } - case AArch64::BI__builtin_neon_vtbx4_v: { - TblOps.push_back(Ops[1]); - TblOps.push_back(Ops[2]); - TblOps.push_back(Ops[3]); - TblOps.push_back(Ops[4]); - return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty, - Intrinsic::aarch64_neon_vtbx2, "vtbx2"); - } - case AArch64::BI__builtin_neon_vqtbl1_v: - case AArch64::BI__builtin_neon_vqtbl1q_v: - Int = Intrinsic::aarch64_neon_vtbl1; s = "vtbl1"; break; - case AArch64::BI__builtin_neon_vqtbl2_v: - case AArch64::BI__builtin_neon_vqtbl2q_v: { - Int = Intrinsic::aarch64_neon_vtbl2; s = "vtbl2"; break; - case AArch64::BI__builtin_neon_vqtbl3_v: - case AArch64::BI__builtin_neon_vqtbl3q_v: - Int = Intrinsic::aarch64_neon_vtbl3; s = "vtbl3"; break; - case AArch64::BI__builtin_neon_vqtbl4_v: - case AArch64::BI__builtin_neon_vqtbl4q_v: - Int = Intrinsic::aarch64_neon_vtbl4; s = "vtbl4"; break; - case AArch64::BI__builtin_neon_vqtbx1_v: - case AArch64::BI__builtin_neon_vqtbx1q_v: - Int = Intrinsic::aarch64_neon_vtbx1; s = "vtbx1"; break; - case AArch64::BI__builtin_neon_vqtbx2_v: - case AArch64::BI__builtin_neon_vqtbx2q_v: - Int = Intrinsic::aarch64_neon_vtbx2; s = "vtbx2"; break; - case AArch64::BI__builtin_neon_vqtbx3_v: - case AArch64::BI__builtin_neon_vqtbx3q_v: - Int = Intrinsic::aarch64_neon_vtbx3; s = "vtbx3"; break; - case AArch64::BI__builtin_neon_vqtbx4_v: - case AArch64::BI__builtin_neon_vqtbx4q_v: - Int = Intrinsic::aarch64_neon_vtbx4; s = "vtbx4"; break; - } + assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy()); + // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate + // it before inserting. 
+ Ops[j] = + CGF.Builder.CreateTruncOrBitCast(Ops[j], ArgTy->getVectorElementType()); + Ops[j] = + CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0); } - if (!Int) - return 0; + Value *Result = CGF.EmitNeonCall(F, Ops, s); + llvm::Type *ResultType = CGF.ConvertType(E->getType()); + if (ResultType->getPrimitiveSizeInBits() < + Result->getType()->getPrimitiveSizeInBits()) + return CGF.Builder.CreateExtractElement(Result, C0); - Function *F = CGF.CGM.getIntrinsic(Int, Tys); - return CGF.EmitNeonCall(F, Ops, s); + return CGF.Builder.CreateBitCast(Result, ResultType, s); } -Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, - const CallExpr *E) { - // Process AArch64 scalar builtins - if (Value *Result = EmitAArch64ScalarBuiltinExpr(*this, BuiltinID, E)) - return Result; - - // Process AArch64 table lookup builtins - if (Value *Result = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E)) - return Result; - - if (BuiltinID == AArch64::BI__clear_cache) { - assert(E->getNumArgs() == 2 && - "Variadic __clear_cache slipped through on AArch64"); - - const FunctionDecl *FD = E->getDirectCallee(); - SmallVector<Value *, 2> Ops; - for (unsigned i = 0; i < E->getNumArgs(); i++) - Ops.push_back(EmitScalarExpr(E->getArg(i))); - llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType()); - llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); - StringRef Name = FD->getName(); - return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops); - } - - SmallVector<Value *, 4> Ops; - llvm::Value *Align = 0; // Alignment for load/store - for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) { - if (i == 0) { - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vst1_x2_v: - case AArch64::BI__builtin_neon_vst1q_x2_v: - case AArch64::BI__builtin_neon_vst1_x3_v: - case AArch64::BI__builtin_neon_vst1q_x3_v: - case AArch64::BI__builtin_neon_vst1_x4_v: - case AArch64::BI__builtin_neon_vst1q_x4_v: - // Handle ld1/st1 lane in this function a little different from ARM. - case AArch64::BI__builtin_neon_vld1_lane_v: - case AArch64::BI__builtin_neon_vld1q_lane_v: - case AArch64::BI__builtin_neon_vst1_lane_v: - case AArch64::BI__builtin_neon_vst1q_lane_v: - // Get the alignment for the argument in addition to the value; - // we'll use it later. - std::pair<llvm::Value *, unsigned> Src = - EmitPointerWithAlignment(E->getArg(0)); - Ops.push_back(Src.first); - Align = Builder.getInt32(Src.second); - continue; - } - } - if (i == 1) { - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vld1_x2_v: - case AArch64::BI__builtin_neon_vld1q_x2_v: - case AArch64::BI__builtin_neon_vld1_x3_v: - case AArch64::BI__builtin_neon_vld1q_x3_v: - case AArch64::BI__builtin_neon_vld1_x4_v: - case AArch64::BI__builtin_neon_vld1q_x4_v: - // Handle ld1/st1 dup lane in this function a little different from ARM. - case AArch64::BI__builtin_neon_vld2_dup_v: - case AArch64::BI__builtin_neon_vld2q_dup_v: - case AArch64::BI__builtin_neon_vld3_dup_v: - case AArch64::BI__builtin_neon_vld3q_dup_v: - case AArch64::BI__builtin_neon_vld4_dup_v: - case AArch64::BI__builtin_neon_vld4q_dup_v: - case AArch64::BI__builtin_neon_vld2_lane_v: - case AArch64::BI__builtin_neon_vld2q_lane_v: - // Get the alignment for the argument in addition to the value; - // we'll use it later. 
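
For the scalar (SISD) builtins, the generic path above widens each scalar operand to the vector type the intrinsic expects by inserting it into lane 0, calls the intrinsic, and pulls the scalar back out of lane 0 when the declared return type is narrower than the intrinsic's result. A rough sketch of that wrap/call/unwrap shape in plain C++, with the vector intrinsic modelled by a loop and undef lanes modelled as zero (illustrative names, not the real codegen):

    #include <cstdint>
    #include <cstdio>

    // Stand-in for an 8 x i8 NEON register (the Use64BitVectors case).
    struct V8x8 { int8_t Lane[8]; };

    // Pretend vector intrinsic: lane-wise signed saturating add, like SQADD.8B.
    static V8x8 sqadd8(V8x8 A, V8x8 B) {
      V8x8 R;
      for (int i = 0; i < 8; ++i) {
        int S = A.Lane[i] + B.Lane[i];
        R.Lane[i] = S > 127 ? 127 : (S < -128 ? -128 : (int8_t)S);
      }
      return R;
    }

    // Scalar builtin shaped like vqaddb_s8: insert at lane 0, call, extract lane 0.
    static int8_t scalarSqaddb(int8_t A, int8_t B) {
      V8x8 Va = {}, Vb = {};   // the real code uses undef lanes; zero here
      Va.Lane[0] = A;          // CreateInsertElement(..., A, 0)
      Vb.Lane[0] = B;
      V8x8 R = sqadd8(Va, Vb); // EmitNeonCall on the 64-bit vector form
      return R.Lane[0];        // CreateExtractElement(Result, 0)
    }

    int main() {
      std::printf("%d\n", scalarSqaddb(100, 100));  // saturates to 127
      return 0;
    }
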
- std::pair<llvm::Value *, unsigned> Src = - EmitPointerWithAlignment(E->getArg(1)); - Ops.push_back(Src.first); - Align = Builder.getInt32(Src.second); - continue; - } - } - Ops.push_back(EmitScalarExpr(E->getArg(i))); - } - +Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( + unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic, + const char *NameHint, unsigned Modifier, const CallExpr *E, + SmallVectorImpl<llvm::Value *> &Ops, llvm::Value *Align) { // Get the last argument, which specifies the vector type. - llvm::APSInt Result; + llvm::APSInt NeonTypeConst; const Expr *Arg = E->getArg(E->getNumArgs() - 1); - if (!Arg->isIntegerConstantExpr(Result, getContext())) - return 0; + if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext())) + return nullptr; // Determine the type of this overloaded NEON intrinsic. - NeonTypeFlags Type(Result.getZExtValue()); - bool usgn = Type.isUnsigned(); - bool quad = Type.isQuad(); + NeonTypeFlags Type(NeonTypeConst.getZExtValue()); + bool Usgn = Type.isUnsigned(); + bool Quad = Type.isQuad(); llvm::VectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) - return 0; + return nullptr; + + unsigned Int = LLVMIntrinsic; + if ((Modifier & UnsignedAlts) && !Usgn) + Int = AltLLVMIntrinsic; - unsigned Int; switch (BuiltinID) { - default: - return 0; - - // AArch64 builtins mapping to legacy ARM v7 builtins. - // FIXME: the mapped builtins listed correspond to what has been tested - // in aarch64-neon-intrinsics.c so far. - case AArch64::BI__builtin_neon_vuzp_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vuzp_v, E); - case AArch64::BI__builtin_neon_vuzpq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vuzpq_v, E); - case AArch64::BI__builtin_neon_vzip_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vzip_v, E); - case AArch64::BI__builtin_neon_vzipq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vzipq_v, E); - case AArch64::BI__builtin_neon_vtrn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtrn_v, E); - case AArch64::BI__builtin_neon_vtrnq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtrnq_v, E); - case AArch64::BI__builtin_neon_vext_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vext_v, E); - case AArch64::BI__builtin_neon_vextq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vextq_v, E); - case AArch64::BI__builtin_neon_vmul_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmul_v, E); - case AArch64::BI__builtin_neon_vmulq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmulq_v, E); - case AArch64::BI__builtin_neon_vabd_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabd_v, E); - case AArch64::BI__builtin_neon_vabdq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabdq_v, E); - case AArch64::BI__builtin_neon_vfma_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfma_v, E); - case AArch64::BI__builtin_neon_vfmaq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfmaq_v, E); - case AArch64::BI__builtin_neon_vbsl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbsl_v, E); - case AArch64::BI__builtin_neon_vbslq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbslq_v, E); - case AArch64::BI__builtin_neon_vrsqrts_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrts_v, E); - case AArch64::BI__builtin_neon_vrsqrtsq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrtsq_v, E); - case AArch64::BI__builtin_neon_vrecps_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecps_v, E); - case AArch64::BI__builtin_neon_vrecpsq_v: - 
return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecpsq_v, E); - case AArch64::BI__builtin_neon_vcale_v: - if (VTy->getVectorNumElements() == 1) { - std::swap(Ops[0], Ops[1]); - } else { - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcale_v, E); - } - case AArch64::BI__builtin_neon_vcage_v: - if (VTy->getVectorNumElements() == 1) { - // Determine the types of this overloaded AArch64 intrinsic - SmallVector<llvm::Type *, 3> Tys; - Tys.push_back(VTy); - VTy = llvm::VectorType::get(DoubleTy, 1); - Tys.push_back(VTy); - Tys.push_back(VTy); - Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vcage, Tys); - return EmitNeonCall(F, Ops, "vcage"); - } - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcage_v, E); - case AArch64::BI__builtin_neon_vcaleq_v: - std::swap(Ops[0], Ops[1]); - case AArch64::BI__builtin_neon_vcageq_v: { - Function *F; - if (VTy->getElementType()->isIntegerTy(64)) - F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgeq); - else - F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgeq); - return EmitNeonCall(F, Ops, "vcage"); + default: break; + case NEON::BI__builtin_neon_vabs_v: + case NEON::BI__builtin_neon_vabsq_v: + if (VTy->getElementType()->isFloatingPointTy()) + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); + case NEON::BI__builtin_neon_vaddhn_v: { + llvm::VectorType *SrcTy = + llvm::VectorType::getExtendedElementVectorType(VTy); + + // %sum = add <4 x i32> %lhs, %rhs + Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); + Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); + Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn"); + + // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> + Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(), + SrcTy->getScalarSizeInBits() / 2); + ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt); + Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn"); + + // %res = trunc <4 x i32> %high to <4 x i16> + return Builder.CreateTrunc(Ops[0], VTy, "vaddhn"); } - case AArch64::BI__builtin_neon_vcalt_v: - if (VTy->getVectorNumElements() == 1) { - std::swap(Ops[0], Ops[1]); - } else { - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcalt_v, E); - } - case AArch64::BI__builtin_neon_vcagt_v: - if (VTy->getVectorNumElements() == 1) { - // Determine the types of this overloaded AArch64 intrinsic - SmallVector<llvm::Type *, 3> Tys; - Tys.push_back(VTy); - VTy = llvm::VectorType::get(DoubleTy, 1); - Tys.push_back(VTy); - Tys.push_back(VTy); - Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vcagt, Tys); - return EmitNeonCall(F, Ops, "vcagt"); - } - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcagt_v, E); - case AArch64::BI__builtin_neon_vcaltq_v: + case NEON::BI__builtin_neon_vcale_v: + case NEON::BI__builtin_neon_vcaleq_v: + case NEON::BI__builtin_neon_vcalt_v: + case NEON::BI__builtin_neon_vcaltq_v: std::swap(Ops[0], Ops[1]); - case AArch64::BI__builtin_neon_vcagtq_v: { - Function *F; - if (VTy->getElementType()->isIntegerTy(64)) - F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgtq); - else - F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtq); - return EmitNeonCall(F, Ops, "vcagt"); - } - case AArch64::BI__builtin_neon_vtst_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtst_v, E); - case AArch64::BI__builtin_neon_vtstq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtstq_v, E); - case AArch64::BI__builtin_neon_vhadd_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhadd_v, E); - case 
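
vaddhn ("add and take the high half of each narrowed lane") is open-coded above as a wide add, a logical shift right by half the source lane width, and a truncate. On one 32-bit lane that amounts to the following scalar form (illustrative only):

    #include <cstdint>
    #include <cstdio>

    // One lane of vaddhn for 32-bit inputs: add in 32 bits, keep the high 16.
    static uint16_t addhnLane(uint32_t A, uint32_t B) {
      uint32_t Sum = A + B;        // %sum  = add i32 %a, %b
      uint32_t High = Sum >> 16;   // %high = lshr i32 %sum, 16
      return (uint16_t)High;       // %res  = trunc i32 %high to i16
    }

    int main() {
      // 0x12340000 + 0x00010000 = 0x12350000; the high half is 0x1235.
      std::printf("0x%04x\n", addhnLane(0x12340000u, 0x00010000u));
      return 0;
    }
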
AArch64::BI__builtin_neon_vhaddq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhaddq_v, E); - case AArch64::BI__builtin_neon_vhsub_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsub_v, E); - case AArch64::BI__builtin_neon_vhsubq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsubq_v, E); - case AArch64::BI__builtin_neon_vrhadd_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhadd_v, E); - case AArch64::BI__builtin_neon_vrhaddq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhaddq_v, E); - case AArch64::BI__builtin_neon_vqadd_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqadd_v, E); - case AArch64::BI__builtin_neon_vqaddq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqaddq_v, E); - case AArch64::BI__builtin_neon_vqsub_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsub_v, E); - case AArch64::BI__builtin_neon_vqsubq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsubq_v, E); - case AArch64::BI__builtin_neon_vshl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshl_v, E); - case AArch64::BI__builtin_neon_vshlq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshlq_v, E); - case AArch64::BI__builtin_neon_vqshl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshl_v, E); - case AArch64::BI__builtin_neon_vqshlq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshlq_v, E); - case AArch64::BI__builtin_neon_vrshl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshl_v, E); - case AArch64::BI__builtin_neon_vrshlq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshlq_v, E); - case AArch64::BI__builtin_neon_vqrshl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshl_v, E); - case AArch64::BI__builtin_neon_vqrshlq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshlq_v, E); - case AArch64::BI__builtin_neon_vaddhn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vaddhn_v, E); - case AArch64::BI__builtin_neon_vraddhn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vraddhn_v, E); - case AArch64::BI__builtin_neon_vsubhn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsubhn_v, E); - case AArch64::BI__builtin_neon_vrsubhn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsubhn_v, E); - case AArch64::BI__builtin_neon_vmull_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmull_v, E); - case AArch64::BI__builtin_neon_vqdmull_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmull_v, E); - case AArch64::BI__builtin_neon_vqdmlal_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlal_v, E); - case AArch64::BI__builtin_neon_vqdmlsl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlsl_v, E); - case AArch64::BI__builtin_neon_vmax_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmax_v, E); - case AArch64::BI__builtin_neon_vmaxq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmaxq_v, E); - case AArch64::BI__builtin_neon_vmin_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmin_v, E); - case AArch64::BI__builtin_neon_vminq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vminq_v, E); - case AArch64::BI__builtin_neon_vpmax_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmax_v, E); - case AArch64::BI__builtin_neon_vpmin_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmin_v, E); - case AArch64::BI__builtin_neon_vpadd_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpadd_v, E); - case AArch64::BI__builtin_neon_vqdmulh_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulh_v, E); - case AArch64::BI__builtin_neon_vqdmulhq_v: - 
return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulhq_v, E); - case AArch64::BI__builtin_neon_vqrdmulh_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulh_v, E); - case AArch64::BI__builtin_neon_vqrdmulhq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulhq_v, E); - - // Shift by immediate - case AArch64::BI__builtin_neon_vshr_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshr_n_v, E); - case AArch64::BI__builtin_neon_vshrq_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshrq_n_v, E); - case AArch64::BI__builtin_neon_vrshr_n_v: - case AArch64::BI__builtin_neon_vrshrq_n_v: - Int = usgn ? Intrinsic::aarch64_neon_vurshr - : Intrinsic::aarch64_neon_vsrshr; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n"); - case AArch64::BI__builtin_neon_vsra_n_v: - if (VTy->getElementType()->isIntegerTy(64)) { - Int = usgn ? Intrinsic::aarch64_neon_vsradu_n - : Intrinsic::aarch64_neon_vsrads_n; - return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vsra_n"); - } - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsra_n_v, E); - case AArch64::BI__builtin_neon_vsraq_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsraq_n_v, E); - case AArch64::BI__builtin_neon_vrsra_n_v: - if (VTy->getElementType()->isIntegerTy(64)) { - Int = usgn ? Intrinsic::aarch64_neon_vrsradu_n - : Intrinsic::aarch64_neon_vrsrads_n; - return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vrsra_n"); - } - // fall through - case AArch64::BI__builtin_neon_vrsraq_n_v: { + case NEON::BI__builtin_neon_vcage_v: + case NEON::BI__builtin_neon_vcageq_v: + case NEON::BI__builtin_neon_vcagt_v: + case NEON::BI__builtin_neon_vcagtq_v: { + llvm::Type *VecFlt = llvm::VectorType::get( + VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy, + VTy->getNumElements()); + llvm::Type *Tys[] = { VTy, VecFlt }; + Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); + return EmitNeonCall(F, Ops, NameHint); + } + case NEON::BI__builtin_neon_vclz_v: + case NEON::BI__builtin_neon_vclzq_v: + // We generate target-independent intrinsic, which needs a second argument + // for whether or not clz of zero is undefined; on ARM it isn't. + Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef())); + break; + case NEON::BI__builtin_neon_vcvt_f32_v: + case NEON::BI__builtin_neon_vcvtq_f32_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Int = usgn ? 
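
vclz/vclzq are lowered above to the target-independent llvm.ctlz intrinsic, whose extra i1 argument says whether a zero input is undefined; the code passes the target's answer, and on ARM clz of zero is defined to return the full element width. A scalar sketch of the distinction, using the GCC/Clang __builtin_clz (hypothetical helper, not the NEON builtin itself):

    #include <cstdint>
    #include <cstdio>

    // clz with the "zero is defined" behaviour NEON guarantees: clz(0) == 32.
    static unsigned clz32Defined(uint32_t X) {
      if (X == 0) return 32;              // the case the i1 flag is about
      return (unsigned)__builtin_clz(X);  // __builtin_clz alone is undefined for 0
    }

    int main() {
      std::printf("%u %u %u\n", clz32Defined(0), clz32Defined(1),
                  clz32Defined(0x80000000u));   // prints "32 31 0"
      return 0;
    }
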
Intrinsic::aarch64_neon_vurshr - : Intrinsic::aarch64_neon_vsrshr; - Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]); - return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n"); - } - case AArch64::BI__builtin_neon_vshl_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshl_n_v, E); - case AArch64::BI__builtin_neon_vshlq_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshlq_n_v, E); - case AArch64::BI__builtin_neon_vqshl_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshl_n_v, E); - case AArch64::BI__builtin_neon_vqshlq_n_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshlq_n_v, E); - case AArch64::BI__builtin_neon_vqshlu_n_v: - case AArch64::BI__builtin_neon_vqshluq_n_v: - Int = Intrinsic::aarch64_neon_vsqshlu; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n"); - case AArch64::BI__builtin_neon_vsri_n_v: - case AArch64::BI__builtin_neon_vsriq_n_v: - Int = Intrinsic::aarch64_neon_vsri; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsri_n"); - case AArch64::BI__builtin_neon_vsli_n_v: - case AArch64::BI__builtin_neon_vsliq_n_v: - Int = Intrinsic::aarch64_neon_vsli; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsli_n"); - case AArch64::BI__builtin_neon_vshll_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); - Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); - if (usgn) - Ops[0] = Builder.CreateZExt(Ops[0], VTy); - else - Ops[0] = Builder.CreateSExt(Ops[0], VTy); - Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false); - return Builder.CreateShl(Ops[0], Ops[1], "vshll_n"); - } - case AArch64::BI__builtin_neon_vshrn_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); - Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); - Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); - if (usgn) - Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]); - else - Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]); - return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n"); - } - case AArch64::BI__builtin_neon_vqshrun_n_v: - Int = Intrinsic::aarch64_neon_vsqshrun; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n"); - case AArch64::BI__builtin_neon_vrshrn_n_v: - Int = Intrinsic::aarch64_neon_vrshrn; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n"); - case AArch64::BI__builtin_neon_vqrshrun_n_v: - Int = Intrinsic::aarch64_neon_vsqrshrun; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n"); - case AArch64::BI__builtin_neon_vqshrn_n_v: - Int = usgn ? Intrinsic::aarch64_neon_vuqshrn - : Intrinsic::aarch64_neon_vsqshrn; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n"); - case AArch64::BI__builtin_neon_vqrshrn_n_v: - Int = usgn ? Intrinsic::aarch64_neon_vuqrshrn - : Intrinsic::aarch64_neon_vsqrshrn; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n"); - - // Convert - case AArch64::BI__builtin_neon_vmovl_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmovl_v, E); - case AArch64::BI__builtin_neon_vcvt_n_f32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_f32_v, E); - case AArch64::BI__builtin_neon_vcvtq_n_f32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_f32_v, E); - case AArch64::BI__builtin_neon_vcvt_n_f64_v: - case AArch64::BI__builtin_neon_vcvtq_n_f64_v: { + Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad)); + return Usgn ? 
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") + : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); + case NEON::BI__builtin_neon_vcvt_n_f32_v: + case NEON::BI__builtin_neon_vcvt_n_f64_v: + case NEON::BI__builtin_neon_vcvtq_n_f32_v: + case NEON::BI__builtin_neon_vcvtq_n_f64_v: { + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); llvm::Type *FloatTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); + GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, + false, Quad)); llvm::Type *Tys[2] = { FloatTy, Ty }; - Int = usgn ? Intrinsic::arm_neon_vcvtfxu2fp - : Intrinsic::arm_neon_vcvtfxs2fp; + Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; Function *F = CGM.getIntrinsic(Int, Tys); return EmitNeonCall(F, Ops, "vcvt_n"); } - case AArch64::BI__builtin_neon_vcvt_n_s32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_s32_v, E); - case AArch64::BI__builtin_neon_vcvtq_n_s32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_s32_v, E); - case AArch64::BI__builtin_neon_vcvt_n_u32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_u32_v, E); - case AArch64::BI__builtin_neon_vcvtq_n_u32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_u32_v, E); - case AArch64::BI__builtin_neon_vcvt_n_s64_v: - case AArch64::BI__builtin_neon_vcvt_n_u64_v: - case AArch64::BI__builtin_neon_vcvtq_n_s64_v: - case AArch64::BI__builtin_neon_vcvtq_n_u64_v: { + case NEON::BI__builtin_neon_vcvt_n_s32_v: + case NEON::BI__builtin_neon_vcvt_n_u32_v: + case NEON::BI__builtin_neon_vcvt_n_s64_v: + case NEON::BI__builtin_neon_vcvt_n_u64_v: + case NEON::BI__builtin_neon_vcvtq_n_s32_v: + case NEON::BI__builtin_neon_vcvtq_n_u32_v: + case NEON::BI__builtin_neon_vcvtq_n_s64_v: + case NEON::BI__builtin_neon_vcvtq_n_u64_v: { + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); llvm::Type *FloatTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); + GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, + false, Quad)); llvm::Type *Tys[2] = { Ty, FloatTy }; - Int = usgn ? Intrinsic::arm_neon_vcvtfp2fxu - : Intrinsic::arm_neon_vcvtfp2fxs; - Function *F = CGM.getIntrinsic(Int, Tys); + Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); return EmitNeonCall(F, Ops, "vcvt_n"); } + case NEON::BI__builtin_neon_vcvt_s32_v: + case NEON::BI__builtin_neon_vcvt_u32_v: + case NEON::BI__builtin_neon_vcvt_s64_v: + case NEON::BI__builtin_neon_vcvt_u64_v: + case NEON::BI__builtin_neon_vcvtq_s32_v: + case NEON::BI__builtin_neon_vcvtq_u32_v: + case NEON::BI__builtin_neon_vcvtq_s64_v: + case NEON::BI__builtin_neon_vcvtq_u64_v: { + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *FloatTy = + GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, + false, Quad)); + Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy); + return Usgn ? 
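
The vcvt_n_* builtins handled above convert between integers and floating point with an implied binary point: the immediate n scales by 2^n, so fixed-to-float divides by 2^n and float-to-fixed multiplies by 2^n (the intrinsic names vcvtfxs2fp / vcvtfp2fxs reflect the fixed-point direction). A scalar illustration of the scaling, ignoring the rounding-mode and saturation details of the real instruction:

    #include <cstdint>
    #include <cstdio>

    // One lane of a fixed-to-float convert with N fraction bits.
    static float fixedToFloat(int32_t X, unsigned N) {
      return (float)X / (float)(1u << N);
    }

    // One lane of a float-to-fixed convert, truncating toward zero (no saturation here).
    static int32_t floatToFixed(float X, unsigned N) {
      return (int32_t)(X * (float)(1u << N));
    }

    int main() {
      // With n = 16, the integer 0x00018000 represents 1.5.
      std::printf("%g\n", fixedToFloat(0x00018000, 16));          // 1.5
      std::printf("0x%08x\n", (uint32_t)floatToFixed(1.5f, 16));  // 0x00018000
      return 0;
    }
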
Builder.CreateFPToUI(Ops[0], Ty, "vcvt") + : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); + } + case NEON::BI__builtin_neon_vcvta_s32_v: + case NEON::BI__builtin_neon_vcvta_s64_v: + case NEON::BI__builtin_neon_vcvta_u32_v: + case NEON::BI__builtin_neon_vcvta_u64_v: + case NEON::BI__builtin_neon_vcvtaq_s32_v: + case NEON::BI__builtin_neon_vcvtaq_s64_v: + case NEON::BI__builtin_neon_vcvtaq_u32_v: + case NEON::BI__builtin_neon_vcvtaq_u64_v: + case NEON::BI__builtin_neon_vcvtn_s32_v: + case NEON::BI__builtin_neon_vcvtn_s64_v: + case NEON::BI__builtin_neon_vcvtn_u32_v: + case NEON::BI__builtin_neon_vcvtn_u64_v: + case NEON::BI__builtin_neon_vcvtnq_s32_v: + case NEON::BI__builtin_neon_vcvtnq_s64_v: + case NEON::BI__builtin_neon_vcvtnq_u32_v: + case NEON::BI__builtin_neon_vcvtnq_u64_v: + case NEON::BI__builtin_neon_vcvtp_s32_v: + case NEON::BI__builtin_neon_vcvtp_s64_v: + case NEON::BI__builtin_neon_vcvtp_u32_v: + case NEON::BI__builtin_neon_vcvtp_u64_v: + case NEON::BI__builtin_neon_vcvtpq_s32_v: + case NEON::BI__builtin_neon_vcvtpq_s64_v: + case NEON::BI__builtin_neon_vcvtpq_u32_v: + case NEON::BI__builtin_neon_vcvtpq_u64_v: + case NEON::BI__builtin_neon_vcvtm_s32_v: + case NEON::BI__builtin_neon_vcvtm_s64_v: + case NEON::BI__builtin_neon_vcvtm_u32_v: + case NEON::BI__builtin_neon_vcvtm_u64_v: + case NEON::BI__builtin_neon_vcvtmq_s32_v: + case NEON::BI__builtin_neon_vcvtmq_s64_v: + case NEON::BI__builtin_neon_vcvtmq_u32_v: + case NEON::BI__builtin_neon_vcvtmq_u64_v: { + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, Quad)); + llvm::Type *Tys[2] = { Ty, InTy }; + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint); + } + case NEON::BI__builtin_neon_vext_v: + case NEON::BI__builtin_neon_vextq_v: { + int CV = cast<ConstantInt>(Ops[2])->getSExtValue(); + SmallVector<Constant*, 16> Indices; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) + Indices.push_back(ConstantInt::get(Int32Ty, i+CV)); - // Load/Store - case AArch64::BI__builtin_neon_vld1_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1_v, E); - case AArch64::BI__builtin_neon_vld1q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1q_v, E); - case AArch64::BI__builtin_neon_vld2_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2_v, E); - case AArch64::BI__builtin_neon_vld2q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_v, E); - case AArch64::BI__builtin_neon_vld3_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3_v, E); - case AArch64::BI__builtin_neon_vld3q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3q_v, E); - case AArch64::BI__builtin_neon_vld4_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4_v, E); - case AArch64::BI__builtin_neon_vld4q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4q_v, E); - case AArch64::BI__builtin_neon_vst1_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst1_v, E); - case AArch64::BI__builtin_neon_vst1q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst1q_v, E); - case AArch64::BI__builtin_neon_vst2_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2_v, E); - case AArch64::BI__builtin_neon_vst2q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2q_v, E); - case AArch64::BI__builtin_neon_vst3_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3_v, E); - case AArch64::BI__builtin_neon_vst3q_v: - return 
EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3q_v, E); - case AArch64::BI__builtin_neon_vst4_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4_v, E); - case AArch64::BI__builtin_neon_vst4q_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4q_v, E); - case AArch64::BI__builtin_neon_vld1_x2_v: - case AArch64::BI__builtin_neon_vld1q_x2_v: - case AArch64::BI__builtin_neon_vld1_x3_v: - case AArch64::BI__builtin_neon_vld1q_x3_v: - case AArch64::BI__builtin_neon_vld1_x4_v: - case AArch64::BI__builtin_neon_vld1q_x4_v: { - unsigned Int; - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vld1_x2_v: - case AArch64::BI__builtin_neon_vld1q_x2_v: - Int = Intrinsic::aarch64_neon_vld1x2; - break; - case AArch64::BI__builtin_neon_vld1_x3_v: - case AArch64::BI__builtin_neon_vld1q_x3_v: - Int = Intrinsic::aarch64_neon_vld1x3; - break; - case AArch64::BI__builtin_neon_vld1_x4_v: - case AArch64::BI__builtin_neon_vld1q_x4_v: - Int = Intrinsic::aarch64_neon_vld1x4; - break; - } - Function *F = CGM.getIntrinsic(Int, Ty); - Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld1xN"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Value *SV = llvm::ConstantVector::get(Indices); + return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext"); + } + case NEON::BI__builtin_neon_vfma_v: + case NEON::BI__builtin_neon_vfmaq_v: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + + // NEON intrinsic puts accumulator first, unlike the LLVM fma. + return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); } - case AArch64::BI__builtin_neon_vst1_x2_v: - case AArch64::BI__builtin_neon_vst1q_x2_v: - case AArch64::BI__builtin_neon_vst1_x3_v: - case AArch64::BI__builtin_neon_vst1q_x3_v: - case AArch64::BI__builtin_neon_vst1_x4_v: - case AArch64::BI__builtin_neon_vst1q_x4_v: { + case NEON::BI__builtin_neon_vld1_v: + case NEON::BI__builtin_neon_vld1q_v: Ops.push_back(Align); - unsigned Int; - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vst1_x2_v: - case AArch64::BI__builtin_neon_vst1q_x2_v: - Int = Intrinsic::aarch64_neon_vst1x2; - break; - case AArch64::BI__builtin_neon_vst1_x3_v: - case AArch64::BI__builtin_neon_vst1q_x3_v: - Int = Intrinsic::aarch64_neon_vst1x3; - break; - case AArch64::BI__builtin_neon_vst1_x4_v: - case AArch64::BI__builtin_neon_vst1q_x4_v: - Int = Intrinsic::aarch64_neon_vst1x4; - break; - } - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, ""); + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vld1"); + case NEON::BI__builtin_neon_vld2_v: + case NEON::BI__builtin_neon_vld2q_v: + case NEON::BI__builtin_neon_vld3_v: + case NEON::BI__builtin_neon_vld3q_v: + case NEON::BI__builtin_neon_vld4_v: + case NEON::BI__builtin_neon_vld4q_v: { + Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty); + Ops[1] = Builder.CreateCall2(F, Ops[1], Align, NameHint); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateStore(Ops[1], Ops[0]); } - case AArch64::BI__builtin_neon_vld1_lane_v: - case AArch64::BI__builtin_neon_vld1q_lane_v: { - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + case NEON::BI__builtin_neon_vld1_dup_v: + case NEON::BI__builtin_neon_vld1q_dup_v: { + Value *V = UndefValue::get(Ty); Ty = 
llvm::PointerType::getUnqual(VTy->getElementType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); LoadInst *Ld = Builder.CreateLoad(Ops[0]); Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue()); - return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane"); - } - case AArch64::BI__builtin_neon_vld2_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_lane_v, E); - case AArch64::BI__builtin_neon_vld2q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_lane_v, E); - case AArch64::BI__builtin_neon_vld3_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3_lane_v, E); - case AArch64::BI__builtin_neon_vld3q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3q_lane_v, E); - case AArch64::BI__builtin_neon_vld4_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4_lane_v, E); - case AArch64::BI__builtin_neon_vld4q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4q_lane_v, E); - case AArch64::BI__builtin_neon_vst1_lane_v: - case AArch64::BI__builtin_neon_vst1q_lane_v: { - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - StoreInst *St = - Builder.CreateStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty)); - St->setAlignment(cast<ConstantInt>(Align)->getZExtValue()); - return St; + llvm::Constant *CI = ConstantInt::get(SizeTy, 0); + Ops[0] = Builder.CreateInsertElement(V, Ld, CI); + return EmitNeonSplat(Ops[0], CI); } - case AArch64::BI__builtin_neon_vst2_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2_lane_v, E); - case AArch64::BI__builtin_neon_vst2q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2q_lane_v, E); - case AArch64::BI__builtin_neon_vst3_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3_lane_v, E); - case AArch64::BI__builtin_neon_vst3q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3q_lane_v, E); - case AArch64::BI__builtin_neon_vst4_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4_lane_v, E); - case AArch64::BI__builtin_neon_vst4q_lane_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4q_lane_v, E); - case AArch64::BI__builtin_neon_vld1_dup_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1_dup_v, E); - case AArch64::BI__builtin_neon_vld1q_dup_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1q_dup_v, E); - case AArch64::BI__builtin_neon_vld2_dup_v: - case AArch64::BI__builtin_neon_vld2q_dup_v: - case AArch64::BI__builtin_neon_vld3_dup_v: - case AArch64::BI__builtin_neon_vld3q_dup_v: - case AArch64::BI__builtin_neon_vld4_dup_v: - case AArch64::BI__builtin_neon_vld4q_dup_v: { - // Handle 64-bit x 1 elements as a special-case. There is no "dup" needed. 
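// A standalone scalar sketch of the vld1_dup lowering above, assuming a
// 4 x uint32_t shape purely for illustration: one element is loaded and then
// splatted into every lane of the result, mirroring the insertelement-at-lane-0
// followed by EmitNeonSplat sequence in the code.
#include <array>
#include <cstdint>

static std::array<std::uint32_t, 4> vld1q_dup_u32_ref(const std::uint32_t *p) {
  const std::uint32_t loaded = *p;        // single scalar load
  std::array<std::uint32_t, 4> result{};
  for (auto &lane : result)               // splat the loaded value into every lane
    lane = loaded;
  return result;
}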
- if (VTy->getElementType()->getPrimitiveSizeInBits() == 64 && - VTy->getNumElements() == 1) { - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vld2_dup_v: - Int = Intrinsic::arm_neon_vld2; - break; - case AArch64::BI__builtin_neon_vld3_dup_v: - Int = Intrinsic::arm_neon_vld3; - break; - case AArch64::BI__builtin_neon_vld4_dup_v: - Int = Intrinsic::arm_neon_vld4; - break; - default: - llvm_unreachable("unknown vld_dup intrinsic?"); - } - Function *F = CGM.getIntrinsic(Int, Ty); - Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld_dup"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - switch (BuiltinID) { - case AArch64::BI__builtin_neon_vld2_dup_v: - case AArch64::BI__builtin_neon_vld2q_dup_v: - Int = Intrinsic::arm_neon_vld2lane; - break; - case AArch64::BI__builtin_neon_vld3_dup_v: - case AArch64::BI__builtin_neon_vld3q_dup_v: - Int = Intrinsic::arm_neon_vld3lane; - break; - case AArch64::BI__builtin_neon_vld4_dup_v: - case AArch64::BI__builtin_neon_vld4q_dup_v: - Int = Intrinsic::arm_neon_vld4lane; - break; - } - Function *F = CGM.getIntrinsic(Int, Ty); - llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType()); - - SmallVector<Value *, 6> Args; - Args.push_back(Ops[1]); - Args.append(STy->getNumElements(), UndefValue::get(Ty)); - - llvm::Constant *CI = ConstantInt::get(Int32Ty, 0); - Args.push_back(CI); - Args.push_back(Align); - - Ops[1] = Builder.CreateCall(F, Args, "vld_dup"); - // splat lane 0 to all elts in each vector of the result. - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Value *Val = Builder.CreateExtractValue(Ops[1], i); - Value *Elt = Builder.CreateBitCast(Val, Ty); - Elt = EmitNeonSplat(Elt, CI); - Elt = Builder.CreateBitCast(Elt, Val->getType()); - Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i); - } + case NEON::BI__builtin_neon_vld2_lane_v: + case NEON::BI__builtin_neon_vld2q_lane_v: + case NEON::BI__builtin_neon_vld3_lane_v: + case NEON::BI__builtin_neon_vld3q_lane_v: + case NEON::BI__builtin_neon_vld4_lane_v: + case NEON::BI__builtin_neon_vld4q_lane_v: { + Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty); + for (unsigned I = 2; I < Ops.size() - 1; ++I) + Ops[I] = Builder.CreateBitCast(Ops[I], Ty); + Ops.push_back(Align); + Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint); Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); return Builder.CreateStore(Ops[1], Ops[0]); } + case NEON::BI__builtin_neon_vmovl_v: { + llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); + Ops[0] = Builder.CreateBitCast(Ops[0], DTy); + if (Usgn) + return Builder.CreateZExt(Ops[0], Ty, "vmovl"); + return Builder.CreateSExt(Ops[0], Ty, "vmovl"); + } + case NEON::BI__builtin_neon_vmovn_v: { + llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); + Ops[0] = Builder.CreateBitCast(Ops[0], QTy); + return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); + } + case NEON::BI__builtin_neon_vmull_v: + // FIXME: the integer vmull operations could be emitted in terms of pure + // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of + // hoisting the exts outside loops. Until global ISel comes along that can + // see through such movement this leads to bad CodeGen. So we need an + // intrinsic for now. + Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls; + Int = Type.isPoly() ? 
(unsigned)Intrinsic::arm_neon_vmullp : Int; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull"); + case NEON::BI__builtin_neon_vpadal_v: + case NEON::BI__builtin_neon_vpadalq_v: { + // The source operand type has twice as many elements of half the size. + unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); + llvm::Type *EltTy = + llvm::IntegerType::get(getLLVMContext(), EltBits / 2); + llvm::Type *NarrowTy = + llvm::VectorType::get(EltTy, VTy->getNumElements() * 2); + llvm::Type *Tys[2] = { Ty, NarrowTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); + } + case NEON::BI__builtin_neon_vpaddl_v: + case NEON::BI__builtin_neon_vpaddlq_v: { + // The source operand type has twice as many elements of half the size. + unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); + llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2); + llvm::Type *NarrowTy = + llvm::VectorType::get(EltTy, VTy->getNumElements() * 2); + llvm::Type *Tys[2] = { Ty, NarrowTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl"); + } + case NEON::BI__builtin_neon_vqdmlal_v: + case NEON::BI__builtin_neon_vqdmlsl_v: { + SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end()); + Value *Mul = EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), + MulOps, "vqdmlal"); - // Crypto - case AArch64::BI__builtin_neon_vaeseq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aese, Ty), - Ops, "aese"); - case AArch64::BI__builtin_neon_vaesdq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesd, Ty), - Ops, "aesd"); - case AArch64::BI__builtin_neon_vaesmcq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesmc, Ty), - Ops, "aesmc"); - case AArch64::BI__builtin_neon_vaesimcq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesimc, Ty), - Ops, "aesimc"); - case AArch64::BI__builtin_neon_vsha1su1q_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su1, Ty), - Ops, "sha1su1"); - case AArch64::BI__builtin_neon_vsha256su0q_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su0, Ty), - Ops, "sha256su0"); - case AArch64::BI__builtin_neon_vsha1su0q_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su0, Ty), - Ops, "sha1su0"); - case AArch64::BI__builtin_neon_vsha256hq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h, Ty), - Ops, "sha256h"); - case AArch64::BI__builtin_neon_vsha256h2q_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h2, Ty), - Ops, "sha256h2"); - case AArch64::BI__builtin_neon_vsha256su1q_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su1, Ty), - Ops, "sha256su1"); - case AArch64::BI__builtin_neon_vmul_lane_v: - case AArch64::BI__builtin_neon_vmul_laneq_v: { - // v1f64 vmul_lane should be mapped to Neon scalar mul lane - bool Quad = false; - if (BuiltinID == AArch64::BI__builtin_neon_vmul_laneq_v) - Quad = true; - Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); - Ops[1] = Builder.CreateBitCast(Ops[1], VTy); - Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); - Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); - return Builder.CreateBitCast(Result, Ty); + SmallVector<Value *, 2> AccumOps; + AccumOps.push_back(Ops[0]); + AccumOps.push_back(Mul); + return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), + AccumOps, NameHint); + } + case NEON::BI__builtin_neon_vqshl_n_v: 
+ case NEON::BI__builtin_neon_vqshlq_n_v: + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n", + 1, false); + case NEON::BI__builtin_neon_vrecpe_v: + case NEON::BI__builtin_neon_vrecpeq_v: + case NEON::BI__builtin_neon_vrsqrte_v: + case NEON::BI__builtin_neon_vrsqrteq_v: + Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint); + + case NEON::BI__builtin_neon_vshl_n_v: + case NEON::BI__builtin_neon_vshlq_n_v: + Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false); + return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], + "vshl_n"); + case NEON::BI__builtin_neon_vshll_n_v: { + llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); + Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); + if (Usgn) + Ops[0] = Builder.CreateZExt(Ops[0], VTy); + else + Ops[0] = Builder.CreateSExt(Ops[0], VTy); + Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false); + return Builder.CreateShl(Ops[0], Ops[1], "vshll_n"); } + case NEON::BI__builtin_neon_vshrn_n_v: { + llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); + Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); + Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); + if (Usgn) + Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]); + else + Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]); + return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n"); + } + case NEON::BI__builtin_neon_vshr_n_v: + case NEON::BI__builtin_neon_vshrq_n_v: + return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n"); + case NEON::BI__builtin_neon_vst1_v: + case NEON::BI__builtin_neon_vst1q_v: + case NEON::BI__builtin_neon_vst2_v: + case NEON::BI__builtin_neon_vst2q_v: + case NEON::BI__builtin_neon_vst3_v: + case NEON::BI__builtin_neon_vst3q_v: + case NEON::BI__builtin_neon_vst4_v: + case NEON::BI__builtin_neon_vst4q_v: + case NEON::BI__builtin_neon_vst2_lane_v: + case NEON::BI__builtin_neon_vst2q_lane_v: + case NEON::BI__builtin_neon_vst3_lane_v: + case NEON::BI__builtin_neon_vst3q_lane_v: + case NEON::BI__builtin_neon_vst4_lane_v: + case NEON::BI__builtin_neon_vst4q_lane_v: + Ops.push_back(Align); + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, ""); + case NEON::BI__builtin_neon_vsubhn_v: { + llvm::VectorType *SrcTy = + llvm::VectorType::getExtendedElementVectorType(VTy); - // AArch64-only builtins - case AArch64::BI__builtin_neon_vfmaq_laneq_v: { - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + // %sum = add <4 x i32> %lhs, %rhs + Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); + Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); + Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn"); - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3])); - return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> + Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(), + SrcTy->getScalarSizeInBits() / 2); + ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt); + Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn"); + + // %res = trunc <4 x i32> %high to <4 x i16> + return Builder.CreateTrunc(Ops[0], VTy, "vsubhn"); } - case AArch64::BI__builtin_neon_vfmaq_lane_v: { - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + case NEON::BI__builtin_neon_vtrn_v: + case NEON::BI__builtin_neon_vtrnq_v: { + Ops[0] = 
Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Value *SV = nullptr; - llvm::VectorType *VTy = cast<llvm::VectorType>(Ty); - llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(), - VTy->getNumElements() / 2); - Ops[2] = Builder.CreateBitCast(Ops[2], STy); - Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), - cast<ConstantInt>(Ops[3])); - Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane"); - - return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); - } - case AArch64::BI__builtin_neon_vfma_lane_v: { - llvm::VectorType *VTy = cast<llvm::VectorType>(Ty); - // v1f64 fma should be mapped to Neon scalar f64 fma - if (VTy && VTy->getElementType() == DoubleTy) { - Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, false)); - Ops[2] = Builder.CreateBitCast(Ops[2], VTy); - Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); - Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy); - Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); - return Builder.CreateBitCast(Result, Ty); + for (unsigned vi = 0; vi != 2; ++vi) { + SmallVector<Constant*, 16> Indices; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { + Indices.push_back(Builder.getInt32(i+vi)); + Indices.push_back(Builder.getInt32(i+e+vi)); + } + Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi); + SV = llvm::ConstantVector::get(Indices); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn"); + SV = Builder.CreateStore(SV, Addr); } - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + return SV; + } + case NEON::BI__builtin_neon_vtst_v: + case NEON::BI__builtin_neon_vtstq_v: { Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3])); - return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]); + Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0], + ConstantAggregateZero::get(Ty)); + return Builder.CreateSExt(Ops[0], Ty, "vtst"); } - case AArch64::BI__builtin_neon_vfma_laneq_v: { - llvm::VectorType *VTy = cast<llvm::VectorType>(Ty); - // v1f64 fma should be mapped to Neon scalar f64 fma - if (VTy && VTy->getElementType() == DoubleTy) { - Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true)); - Ops[2] = Builder.CreateBitCast(Ops[2], VTy); - Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); - Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy); - Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); - return Builder.CreateBitCast(Result, Ty); - } - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + case NEON::BI__builtin_neon_vuzp_v: + case NEON::BI__builtin_neon_vuzpq_v: { + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Value *SV = nullptr; - llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(), - VTy->getNumElements() * 2); - Ops[2] = Builder.CreateBitCast(Ops[2], STy); - Value* SV = 
llvm::ConstantVector::getSplat(VTy->getNumElements(), - cast<ConstantInt>(Ops[3])); - Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane"); + for (unsigned vi = 0; vi != 2; ++vi) { + SmallVector<Constant*, 16> Indices; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) + Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi)); - return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi); + SV = llvm::ConstantVector::get(Indices); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp"); + SV = Builder.CreateStore(SV, Addr); + } + return SV; } - case AArch64::BI__builtin_neon_vfms_v: - case AArch64::BI__builtin_neon_vfmsq_v: { - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + case NEON::BI__builtin_neon_vzip_v: + case NEON::BI__builtin_neon_vzipq_v: { + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Ops[1] = Builder.CreateFNeg(Ops[1]); Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Value *SV = nullptr; - // LLVM's fma intrinsic puts the accumulator in the last position, but the - // AArch64 intrinsic has it first. - return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); - } - case AArch64::BI__builtin_neon_vmaxnm_v: - case AArch64::BI__builtin_neon_vmaxnmq_v: { - Int = Intrinsic::aarch64_neon_vmaxnm; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm"); - } - case AArch64::BI__builtin_neon_vminnm_v: - case AArch64::BI__builtin_neon_vminnmq_v: { - Int = Intrinsic::aarch64_neon_vminnm; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm"); - } - case AArch64::BI__builtin_neon_vpmaxnm_v: - case AArch64::BI__builtin_neon_vpmaxnmq_v: { - Int = Intrinsic::aarch64_neon_vpmaxnm; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm"); - } - case AArch64::BI__builtin_neon_vpminnm_v: - case AArch64::BI__builtin_neon_vpminnmq_v: { - Int = Intrinsic::aarch64_neon_vpminnm; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm"); - } - case AArch64::BI__builtin_neon_vpmaxq_v: { - Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax"); - } - case AArch64::BI__builtin_neon_vpminq_v: { - Int = usgn ? 
Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin"); - } - case AArch64::BI__builtin_neon_vpaddq_v: { - Int = Intrinsic::arm_neon_vpadd; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpadd"); - } - case AArch64::BI__builtin_neon_vmulx_v: - case AArch64::BI__builtin_neon_vmulxq_v: { - Int = Intrinsic::aarch64_neon_vmulx; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); - } - case AArch64::BI__builtin_neon_vpaddl_v: - case AArch64::BI__builtin_neon_vpaddlq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpaddl_v, E); - case AArch64::BI__builtin_neon_vpadal_v: - case AArch64::BI__builtin_neon_vpadalq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpadal_v, E); - case AArch64::BI__builtin_neon_vqabs_v: - case AArch64::BI__builtin_neon_vqabsq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqabs_v, E); - case AArch64::BI__builtin_neon_vqneg_v: - case AArch64::BI__builtin_neon_vqnegq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqneg_v, E); - case AArch64::BI__builtin_neon_vabs_v: - case AArch64::BI__builtin_neon_vabsq_v: { - if (VTy->getElementType()->isFloatingPointTy()) { - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); + for (unsigned vi = 0; vi != 2; ++vi) { + SmallVector<Constant*, 16> Indices; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { + Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1)); + Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e)); + } + Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi); + SV = llvm::ConstantVector::get(Indices); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip"); + SV = Builder.CreateStore(SV, Addr); } - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabs_v, E); - } - case AArch64::BI__builtin_neon_vsqadd_v: - case AArch64::BI__builtin_neon_vsqaddq_v: { - Int = Intrinsic::aarch64_neon_usqadd; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd"); - } - case AArch64::BI__builtin_neon_vuqadd_v: - case AArch64::BI__builtin_neon_vuqaddq_v: { - Int = Intrinsic::aarch64_neon_suqadd; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); - } - case AArch64::BI__builtin_neon_vcls_v: - case AArch64::BI__builtin_neon_vclsq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcls_v, E); - case AArch64::BI__builtin_neon_vclz_v: - case AArch64::BI__builtin_neon_vclzq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vclz_v, E); - case AArch64::BI__builtin_neon_vcnt_v: - case AArch64::BI__builtin_neon_vcntq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcnt_v, E); - case AArch64::BI__builtin_neon_vrbit_v: - case AArch64::BI__builtin_neon_vrbitq_v: - Int = Intrinsic::aarch64_neon_rbit; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit"); - case AArch64::BI__builtin_neon_vmovn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmovn_v, E); - case AArch64::BI__builtin_neon_vqmovun_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqmovun_v, E); - case AArch64::BI__builtin_neon_vqmovn_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqmovn_v, E); - case AArch64::BI__builtin_neon_vcvt_f16_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f16_v, E); - case AArch64::BI__builtin_neon_vcvt_f32_f16: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f32_f16, E); - case AArch64::BI__builtin_neon_vcvt_f32_f64: { - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, 
NeonTypeFlags(NeonTypeFlags::Float32, false, false)); - return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt"); - } - case AArch64::BI__builtin_neon_vcvtx_f32_v: { - llvm::Type *EltTy = FloatTy; - llvm::Type *ResTy = llvm::VectorType::get(EltTy, 2); - llvm::Type *Tys[2] = { ResTy, Ty }; - Int = Intrinsic::aarch64_neon_fcvtxn; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtx_f32_f64"); - } - case AArch64::BI__builtin_neon_vcvt_f64_f32: { - llvm::Type *OpTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, false)); - Ops[0] = Builder.CreateBitCast(Ops[0], OpTy); - return Builder.CreateFPExt(Ops[0], Ty, "vcvt"); - } - case AArch64::BI__builtin_neon_vcvt_f64_v: - case AArch64::BI__builtin_neon_vcvtq_f64_v: { - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); - return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") - : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - } - case AArch64::BI__builtin_neon_vrndn_v: - case AArch64::BI__builtin_neon_vrndnq_v: { - Int = Intrinsic::aarch64_neon_frintn; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn"); - } - case AArch64::BI__builtin_neon_vrnda_v: - case AArch64::BI__builtin_neon_vrndaq_v: { - Int = Intrinsic::round; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda"); - } - case AArch64::BI__builtin_neon_vrndp_v: - case AArch64::BI__builtin_neon_vrndpq_v: { - Int = Intrinsic::ceil; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp"); - } - case AArch64::BI__builtin_neon_vrndm_v: - case AArch64::BI__builtin_neon_vrndmq_v: { - Int = Intrinsic::floor; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm"); - } - case AArch64::BI__builtin_neon_vrndx_v: - case AArch64::BI__builtin_neon_vrndxq_v: { - Int = Intrinsic::rint; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx"); + return SV; } - case AArch64::BI__builtin_neon_vrnd_v: - case AArch64::BI__builtin_neon_vrndq_v: { - Int = Intrinsic::trunc; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd"); } - case AArch64::BI__builtin_neon_vrndi_v: - case AArch64::BI__builtin_neon_vrndiq_v: { - Int = Intrinsic::nearbyint; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi"); + + assert(Int && "Expected valid intrinsic number"); + + // Determine the type(s) of this overloaded AArch64 intrinsic. + Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E); + + Value *Result = EmitNeonCall(F, Ops, NameHint); + llvm::Type *ResultType = ConvertType(E->getType()); + // AArch64 intrinsic one-element vector type cast to + // scalar type expected by the builtin + return Builder.CreateBitCast(Result, ResultType, NameHint); +} + +Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr( + Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp, + const CmpInst::Predicate Ip, const Twine &Name) { + llvm::Type *OTy = Op->getType(); + + // FIXME: this is utterly horrific. We should not be looking at previous + // codegen context to find out what needs doing. Unfortunately TableGen + // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32 + // (etc). 
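// A rough scalar model of what the zero-compare helper defined here produces
// for vceqz and friends: each lane is compared against zero and the i1 result
// is sign-extended into an all-ones/all-zeros mask. The 4 x int32_t element
// shape is an assumption made only for this sketch.
#include <array>
#include <cstddef>
#include <cstdint>

static std::array<std::uint32_t, 4>
vceqzq_s32_ref(const std::array<std::int32_t, 4> &v) {
  std::array<std::uint32_t, 4> mask{};
  for (std::size_t i = 0; i != v.size(); ++i)
    mask[i] = (v[i] == 0) ? ~std::uint32_t{0}   // sext(true)  -> 0xFFFFFFFF
                          : std::uint32_t{0};   // sext(false) -> 0x00000000
  return mask;
}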
+ if (BitCastInst *BI = dyn_cast<BitCastInst>(Op)) + OTy = BI->getOperand(0)->getType(); + + Op = Builder.CreateBitCast(Op, OTy); + if (OTy->getScalarType()->isFloatingPointTy()) { + Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy)); + } else { + Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy)); } - case AArch64::BI__builtin_neon_vcvt_s32_v: - case AArch64::BI__builtin_neon_vcvt_u32_v: - case AArch64::BI__builtin_neon_vcvtq_s32_v: - case AArch64::BI__builtin_neon_vcvtq_u32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_u32_v, E); - case AArch64::BI__builtin_neon_vcvt_s64_v: - case AArch64::BI__builtin_neon_vcvt_u64_v: - case AArch64::BI__builtin_neon_vcvtq_s64_v: - case AArch64::BI__builtin_neon_vcvtq_u64_v: { - llvm::Type *DoubleTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); - Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - return usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt") - : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); + return Builder.CreateSExt(Op, Ty, Name); +} + +static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops, + Value *ExtOp, Value *IndexOp, + llvm::Type *ResTy, unsigned IntID, + const char *Name) { + SmallVector<Value *, 2> TblOps; + if (ExtOp) + TblOps.push_back(ExtOp); + + // Build a vector containing sequential number like (0, 1, 2, ..., 15) + SmallVector<Constant*, 16> Indices; + llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType()); + for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { + Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i)); + Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i+1)); } - case AArch64::BI__builtin_neon_vcvtn_s32_v: - case AArch64::BI__builtin_neon_vcvtnq_s32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtns; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtns_f32"); - } - case AArch64::BI__builtin_neon_vcvtn_s64_v: - case AArch64::BI__builtin_neon_vcvtnq_s64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtns; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtns_f64"); - } - case AArch64::BI__builtin_neon_vcvtn_u32_v: - case AArch64::BI__builtin_neon_vcvtnq_u32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtnu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtnu_f32"); - } - case AArch64::BI__builtin_neon_vcvtn_u64_v: - case AArch64::BI__builtin_neon_vcvtnq_u64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtnu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtnu_f64"); - } - case AArch64::BI__builtin_neon_vcvtp_s32_v: - case AArch64::BI__builtin_neon_vcvtpq_s32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtps; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtps_f32"); - } - case AArch64::BI__builtin_neon_vcvtp_s64_v: - case AArch64::BI__builtin_neon_vcvtpq_s64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtps; - return 
EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtps_f64"); - } - case AArch64::BI__builtin_neon_vcvtp_u32_v: - case AArch64::BI__builtin_neon_vcvtpq_u32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtpu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtpu_f32"); - } - case AArch64::BI__builtin_neon_vcvtp_u64_v: - case AArch64::BI__builtin_neon_vcvtpq_u64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtpu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtpu_f64"); - } - case AArch64::BI__builtin_neon_vcvtm_s32_v: - case AArch64::BI__builtin_neon_vcvtmq_s32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtms; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtms_f32"); - } - case AArch64::BI__builtin_neon_vcvtm_s64_v: - case AArch64::BI__builtin_neon_vcvtmq_s64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtms; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtms_f64"); - } - case AArch64::BI__builtin_neon_vcvtm_u32_v: - case AArch64::BI__builtin_neon_vcvtmq_u32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtmu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtmu_f32"); - } - case AArch64::BI__builtin_neon_vcvtm_u64_v: - case AArch64::BI__builtin_neon_vcvtmq_u64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtmu; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtmu_f64"); - } - case AArch64::BI__builtin_neon_vcvta_s32_v: - case AArch64::BI__builtin_neon_vcvtaq_s32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtas; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtas_f32"); - } - case AArch64::BI__builtin_neon_vcvta_s64_v: - case AArch64::BI__builtin_neon_vcvtaq_s64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtas; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtas_f64"); - } - case AArch64::BI__builtin_neon_vcvta_u32_v: - case AArch64::BI__builtin_neon_vcvtaq_u32_v: { - llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtau; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtau_f32"); - } - case AArch64::BI__builtin_neon_vcvta_u64_v: - case AArch64::BI__builtin_neon_vcvtaq_u64_v: { - llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements()); - llvm::Type *Tys[2] = { Ty, OpTy }; - Int = Intrinsic::aarch64_neon_fcvtau; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtau_f64"); - } - case AArch64::BI__builtin_neon_vrecpe_v: - case AArch64::BI__builtin_neon_vrecpeq_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecpe_v, E); - case AArch64::BI__builtin_neon_vrsqrte_v: - case AArch64::BI__builtin_neon_vrsqrteq_v: - return 
EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrte_v, E); - case AArch64::BI__builtin_neon_vsqrt_v: - case AArch64::BI__builtin_neon_vsqrtq_v: { - Int = Intrinsic::sqrt; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt"); + Value *SV = llvm::ConstantVector::get(Indices); + + int PairPos = 0, End = Ops.size() - 1; + while (PairPos < End) { + TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], + Ops[PairPos+1], SV, Name)); + PairPos += 2; } - case AArch64::BI__builtin_neon_vcvt_f32_v: - case AArch64::BI__builtin_neon_vcvtq_f32_v: - return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f32_v, E); - case AArch64::BI__builtin_neon_vceqz_v: - case AArch64::BI__builtin_neon_vceqzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ, - ICmpInst::ICMP_EQ, "vceqz"); - case AArch64::BI__builtin_neon_vcgez_v: - case AArch64::BI__builtin_neon_vcgezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE, - ICmpInst::ICMP_SGE, "vcgez"); - case AArch64::BI__builtin_neon_vclez_v: - case AArch64::BI__builtin_neon_vclezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE, - ICmpInst::ICMP_SLE, "vclez"); - case AArch64::BI__builtin_neon_vcgtz_v: - case AArch64::BI__builtin_neon_vcgtzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT, - ICmpInst::ICMP_SGT, "vcgtz"); - case AArch64::BI__builtin_neon_vcltz_v: - case AArch64::BI__builtin_neon_vcltzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT, - ICmpInst::ICMP_SLT, "vcltz"); + + // If there's an odd number of 64-bit lookup table, fill the high 64-bit + // of the 128-bit lookup table with zero. + if (PairPos == End) { + Value *ZeroTbl = ConstantAggregateZero::get(TblTy); + TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], + ZeroTbl, SV, Name)); } + + Function *TblF; + TblOps.push_back(IndexOp); + TblF = CGF.CGM.getIntrinsic(IntID, ResTy); + + return CGF.EmitNeonCall(TblF, TblOps, Name); } Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { + unsigned HintID = static_cast<unsigned>(-1); + switch (BuiltinID) { + default: break; + case ARM::BI__builtin_arm_nop: + HintID = 0; + break; + case ARM::BI__builtin_arm_yield: + case ARM::BI__yield: + HintID = 1; + break; + case ARM::BI__builtin_arm_wfe: + case ARM::BI__wfe: + HintID = 2; + break; + case ARM::BI__builtin_arm_wfi: + case ARM::BI__wfi: + HintID = 3; + break; + case ARM::BI__builtin_arm_sev: + case ARM::BI__sev: + HintID = 4; + break; + case ARM::BI__builtin_arm_sevl: + case ARM::BI__sevl: + HintID = 5; + break; + } + + if (HintID != static_cast<unsigned>(-1)) { + Function *F = CGM.getIntrinsic(Intrinsic::arm_hint); + return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); + } + + if (BuiltinID == ARM::BI__builtin_arm_rbit) { + return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_rbit), + EmitScalarExpr(E->getArg(0)), + "rbit"); + } + if (BuiltinID == ARM::BI__clear_cache) { assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments"); const FunctionDecl *FD = E->getDirectCallee(); @@ -3938,9 +3093,23 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } if (BuiltinID == ARM::BI__builtin_arm_ldrexd || - (BuiltinID == ARM::BI__builtin_arm_ldrex && - getContext().getTypeSize(E->getType()) == 64)) { - Function *F = CGM.getIntrinsic(Intrinsic::arm_ldrexd); + ((BuiltinID == ARM::BI__builtin_arm_ldrex || + BuiltinID == ARM::BI__builtin_arm_ldaex) && + getContext().getTypeSize(E->getType()) == 64) || + 
BuiltinID == ARM::BI__ldrexd) { + Function *F; + + switch (BuiltinID) { + default: llvm_unreachable("unexpected builtin"); + case ARM::BI__builtin_arm_ldaex: + F = CGM.getIntrinsic(Intrinsic::arm_ldaexd); + break; + case ARM::BI__builtin_arm_ldrexd: + case ARM::BI__builtin_arm_ldrex: + case ARM::BI__ldrexd: + F = CGM.getIntrinsic(Intrinsic::arm_ldrexd); + break; + } Value *LdPtr = EmitScalarExpr(E->getArg(0)); Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy), @@ -3957,7 +3126,8 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return Builder.CreateBitCast(Val, ConvertType(E->getType())); } - if (BuiltinID == ARM::BI__builtin_arm_ldrex) { + if (BuiltinID == ARM::BI__builtin_arm_ldrex || + BuiltinID == ARM::BI__builtin_arm_ldaex) { Value *LoadAddr = EmitScalarExpr(E->getArg(0)); QualType Ty = E->getType(); @@ -3966,7 +3136,10 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, getContext().getTypeSize(Ty)); LoadAddr = Builder.CreateBitCast(LoadAddr, IntResTy->getPointerTo()); - Function *F = CGM.getIntrinsic(Intrinsic::arm_ldrex, LoadAddr->getType()); + Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_ldaex + ? Intrinsic::arm_ldaex + : Intrinsic::arm_ldrex, + LoadAddr->getType()); Value *Val = Builder.CreateCall(F, LoadAddr, "ldrex"); if (RealResTy->isPointerTy()) @@ -3978,9 +3151,12 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } if (BuiltinID == ARM::BI__builtin_arm_strexd || - (BuiltinID == ARM::BI__builtin_arm_strex && + ((BuiltinID == ARM::BI__builtin_arm_stlex || + BuiltinID == ARM::BI__builtin_arm_strex) && getContext().getTypeSize(E->getArg(0)->getType()) == 64)) { - Function *F = CGM.getIntrinsic(Intrinsic::arm_strexd); + Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex + ? Intrinsic::arm_stlexd + : Intrinsic::arm_strexd); llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, NULL); Value *Tmp = CreateMemTemp(E->getArg(0)->getType()); @@ -3996,7 +3172,8 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall3(F, Arg0, Arg1, StPtr, "strexd"); } - if (BuiltinID == ARM::BI__builtin_arm_strex) { + if (BuiltinID == ARM::BI__builtin_arm_strex || + BuiltinID == ARM::BI__builtin_arm_stlex) { Value *StoreVal = EmitScalarExpr(E->getArg(0)); Value *StoreAddr = EmitScalarExpr(E->getArg(1)); @@ -4012,7 +3189,10 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty); } - Function *F = CGM.getIntrinsic(Intrinsic::arm_strex, StoreAddr->getType()); + Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI__builtin_arm_stlex + ? 
Intrinsic::arm_stlex + : Intrinsic::arm_strex, + StoreAddr->getType()); return Builder.CreateCall2(F, StoreVal, StoreAddr, "strex"); } @@ -4021,11 +3201,6 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F); } - if (BuiltinID == ARM::BI__builtin_arm_sevl) { - Function *F = CGM.getIntrinsic(Intrinsic::arm_sevl); - return Builder.CreateCall(F); - } - // CRC32 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic; switch (BuiltinID) { @@ -4070,32 +3245,32 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } SmallVector<Value*, 4> Ops; - llvm::Value *Align = 0; + llvm::Value *Align = nullptr; for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) { if (i == 0) { switch (BuiltinID) { - case ARM::BI__builtin_neon_vld1_v: - case ARM::BI__builtin_neon_vld1q_v: - case ARM::BI__builtin_neon_vld1q_lane_v: - case ARM::BI__builtin_neon_vld1_lane_v: - case ARM::BI__builtin_neon_vld1_dup_v: - case ARM::BI__builtin_neon_vld1q_dup_v: - case ARM::BI__builtin_neon_vst1_v: - case ARM::BI__builtin_neon_vst1q_v: - case ARM::BI__builtin_neon_vst1q_lane_v: - case ARM::BI__builtin_neon_vst1_lane_v: - case ARM::BI__builtin_neon_vst2_v: - case ARM::BI__builtin_neon_vst2q_v: - case ARM::BI__builtin_neon_vst2_lane_v: - case ARM::BI__builtin_neon_vst2q_lane_v: - case ARM::BI__builtin_neon_vst3_v: - case ARM::BI__builtin_neon_vst3q_v: - case ARM::BI__builtin_neon_vst3_lane_v: - case ARM::BI__builtin_neon_vst3q_lane_v: - case ARM::BI__builtin_neon_vst4_v: - case ARM::BI__builtin_neon_vst4q_v: - case ARM::BI__builtin_neon_vst4_lane_v: - case ARM::BI__builtin_neon_vst4q_lane_v: + case NEON::BI__builtin_neon_vld1_v: + case NEON::BI__builtin_neon_vld1q_v: + case NEON::BI__builtin_neon_vld1q_lane_v: + case NEON::BI__builtin_neon_vld1_lane_v: + case NEON::BI__builtin_neon_vld1_dup_v: + case NEON::BI__builtin_neon_vld1q_dup_v: + case NEON::BI__builtin_neon_vst1_v: + case NEON::BI__builtin_neon_vst1q_v: + case NEON::BI__builtin_neon_vst1q_lane_v: + case NEON::BI__builtin_neon_vst1_lane_v: + case NEON::BI__builtin_neon_vst2_v: + case NEON::BI__builtin_neon_vst2q_v: + case NEON::BI__builtin_neon_vst2_lane_v: + case NEON::BI__builtin_neon_vst2q_lane_v: + case NEON::BI__builtin_neon_vst3_v: + case NEON::BI__builtin_neon_vst3q_v: + case NEON::BI__builtin_neon_vst3_lane_v: + case NEON::BI__builtin_neon_vst3q_lane_v: + case NEON::BI__builtin_neon_vst4_v: + case NEON::BI__builtin_neon_vst4q_v: + case NEON::BI__builtin_neon_vst4_lane_v: + case NEON::BI__builtin_neon_vst4q_lane_v: // Get the alignment for the argument in addition to the value; // we'll use it later. 
std::pair<llvm::Value*, unsigned> Src = @@ -4107,21 +3282,21 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } if (i == 1) { switch (BuiltinID) { - case ARM::BI__builtin_neon_vld2_v: - case ARM::BI__builtin_neon_vld2q_v: - case ARM::BI__builtin_neon_vld3_v: - case ARM::BI__builtin_neon_vld3q_v: - case ARM::BI__builtin_neon_vld4_v: - case ARM::BI__builtin_neon_vld4q_v: - case ARM::BI__builtin_neon_vld2_lane_v: - case ARM::BI__builtin_neon_vld2q_lane_v: - case ARM::BI__builtin_neon_vld3_lane_v: - case ARM::BI__builtin_neon_vld3q_lane_v: - case ARM::BI__builtin_neon_vld4_lane_v: - case ARM::BI__builtin_neon_vld4q_lane_v: - case ARM::BI__builtin_neon_vld2_dup_v: - case ARM::BI__builtin_neon_vld3_dup_v: - case ARM::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld2_v: + case NEON::BI__builtin_neon_vld2q_v: + case NEON::BI__builtin_neon_vld3_v: + case NEON::BI__builtin_neon_vld3q_v: + case NEON::BI__builtin_neon_vld4_v: + case NEON::BI__builtin_neon_vld4q_v: + case NEON::BI__builtin_neon_vld2_lane_v: + case NEON::BI__builtin_neon_vld2q_lane_v: + case NEON::BI__builtin_neon_vld3_lane_v: + case NEON::BI__builtin_neon_vld3q_lane_v: + case NEON::BI__builtin_neon_vld4_lane_v: + case NEON::BI__builtin_neon_vld4q_lane_v: + case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: // Get the alignment for the argument in addition to the value; // we'll use it later. std::pair<llvm::Value*, unsigned> Src = @@ -4134,41 +3309,59 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, Ops.push_back(EmitScalarExpr(E->getArg(i))); } - // vget_lane and vset_lane are not overloaded and do not have an extra - // argument that specifies the vector type. switch (BuiltinID) { default: break; - case ARM::BI__builtin_neon_vget_lane_i8: - case ARM::BI__builtin_neon_vget_lane_i16: - case ARM::BI__builtin_neon_vget_lane_i32: - case ARM::BI__builtin_neon_vget_lane_i64: - case ARM::BI__builtin_neon_vget_lane_f32: - case ARM::BI__builtin_neon_vgetq_lane_i8: - case ARM::BI__builtin_neon_vgetq_lane_i16: - case ARM::BI__builtin_neon_vgetq_lane_i32: - case ARM::BI__builtin_neon_vgetq_lane_i64: - case ARM::BI__builtin_neon_vgetq_lane_f32: + // vget_lane and vset_lane are not overloaded and do not have an extra + // argument that specifies the vector type. 
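// Scalar picture of the non-overloaded lane builtins handled just below:
// vget_lane is a plain element read and vset_lane a plain element write, with
// no trailing type-tag argument. The 4 x int32_t shape is assumed only for
// illustration.
#include <array>
#include <cstdint>

static std::int32_t vgetq_lane_s32_ref(const std::array<std::int32_t, 4> &v,
                                       unsigned lane) {
  return v[lane];                         // extractelement
}

static std::array<std::int32_t, 4>
vsetq_lane_s32_ref(std::int32_t x, std::array<std::int32_t, 4> v,
                   unsigned lane) {
  v[lane] = x;                            // insertelement
  return v;
}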
+ case NEON::BI__builtin_neon_vget_lane_i8: + case NEON::BI__builtin_neon_vget_lane_i16: + case NEON::BI__builtin_neon_vget_lane_i32: + case NEON::BI__builtin_neon_vget_lane_i64: + case NEON::BI__builtin_neon_vget_lane_f32: + case NEON::BI__builtin_neon_vgetq_lane_i8: + case NEON::BI__builtin_neon_vgetq_lane_i16: + case NEON::BI__builtin_neon_vgetq_lane_i32: + case NEON::BI__builtin_neon_vgetq_lane_i64: + case NEON::BI__builtin_neon_vgetq_lane_f32: return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vget_lane"); - case ARM::BI__builtin_neon_vset_lane_i8: - case ARM::BI__builtin_neon_vset_lane_i16: - case ARM::BI__builtin_neon_vset_lane_i32: - case ARM::BI__builtin_neon_vset_lane_i64: - case ARM::BI__builtin_neon_vset_lane_f32: - case ARM::BI__builtin_neon_vsetq_lane_i8: - case ARM::BI__builtin_neon_vsetq_lane_i16: - case ARM::BI__builtin_neon_vsetq_lane_i32: - case ARM::BI__builtin_neon_vsetq_lane_i64: - case ARM::BI__builtin_neon_vsetq_lane_f32: + case NEON::BI__builtin_neon_vset_lane_i8: + case NEON::BI__builtin_neon_vset_lane_i16: + case NEON::BI__builtin_neon_vset_lane_i32: + case NEON::BI__builtin_neon_vset_lane_i64: + case NEON::BI__builtin_neon_vset_lane_f32: + case NEON::BI__builtin_neon_vsetq_lane_i8: + case NEON::BI__builtin_neon_vsetq_lane_i16: + case NEON::BI__builtin_neon_vsetq_lane_i32: + case NEON::BI__builtin_neon_vsetq_lane_i64: + case NEON::BI__builtin_neon_vsetq_lane_f32: Ops.push_back(EmitScalarExpr(E->getArg(2))); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + + // Non-polymorphic crypto instructions also not overloaded + case NEON::BI__builtin_neon_vsha1h_u32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops, + "vsha1h"); + case NEON::BI__builtin_neon_vsha1cq_u32: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops, + "vsha1h"); + case NEON::BI__builtin_neon_vsha1pq_u32: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops, + "vsha1h"); + case NEON::BI__builtin_neon_vsha1mq_u32: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops, + "vsha1h"); } // Get the last argument, which specifies the vector type. llvm::APSInt Result; const Expr *Arg = E->getArg(E->getNumArgs()-1); if (!Arg->isIntegerConstantExpr(Result, getContext())) - return 0; + return nullptr; if (BuiltinID == ARM::BI__builtin_arm_vcvtr_f || BuiltinID == ARM::BI__builtin_arm_vcvtr_d) { @@ -4191,166 +3384,27 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags Type(Result.getZExtValue()); bool usgn = Type.isUnsigned(); - bool quad = Type.isQuad(); bool rightShift = false; llvm::VectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) - return 0; + return nullptr; + + // Many NEON builtins have identical semantics and uses in ARM and + // AArch64. Emit these in a single function. 
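// A simplified sketch of the shared-table dispatch introduced just below: one
// array of records, sorted by builtin ID, is searched before falling back to
// the target-specific switch. The record layout here is a trimmed-down
// assumption, not the real NeonIntrinsicInfo definition.
#include <algorithm>

struct NeonIntrinsicInfoSketch {
  unsigned BuiltinID;
  unsigned LLVMIntrinsic;
  const char *NameHint;
};

static const NeonIntrinsicInfoSketch *
findInSortedMap(const NeonIntrinsicInfoSketch *Begin,
                const NeonIntrinsicInfoSketch *End, unsigned BuiltinID) {
  const NeonIntrinsicInfoSketch *It = std::lower_bound(
      Begin, End, BuiltinID,
      [](const NeonIntrinsicInfoSketch &Info, unsigned ID) {
        return Info.BuiltinID < ID;       // table must stay sorted by BuiltinID
      });
  return (It != End && It->BuiltinID == BuiltinID) ? It : nullptr;
}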
+ ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap); + const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap( + IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted); + if (Builtin) + return EmitCommonNeonBuiltinExpr( + Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic, + Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align); unsigned Int; switch (BuiltinID) { - default: return 0; - case ARM::BI__builtin_neon_vbsl_v: - case ARM::BI__builtin_neon_vbslq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty), - Ops, "vbsl"); - case ARM::BI__builtin_neon_vabd_v: - case ARM::BI__builtin_neon_vabdq_v: - Int = usgn ? Intrinsic::arm_neon_vabdu : Intrinsic::arm_neon_vabds; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd"); - case ARM::BI__builtin_neon_vabs_v: - case ARM::BI__builtin_neon_vabsq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vabs, Ty), - Ops, "vabs"); - case ARM::BI__builtin_neon_vaddhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); - - // %sum = add <4 x i32> %lhs, %rhs - Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); - Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); - Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn"); - - // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> - Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(), - SrcTy->getScalarSizeInBits() / 2); - ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt); - Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn"); - - // %res = trunc <4 x i32> %high to <4 x i16> - return Builder.CreateTrunc(Ops[0], VTy, "vaddhn"); - } - case ARM::BI__builtin_neon_vcale_v: - std::swap(Ops[0], Ops[1]); - case ARM::BI__builtin_neon_vcage_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacged); - return EmitNeonCall(F, Ops, "vcage"); - } - case ARM::BI__builtin_neon_vcaleq_v: - std::swap(Ops[0], Ops[1]); - case ARM::BI__builtin_neon_vcageq_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgeq); - return EmitNeonCall(F, Ops, "vcage"); - } - case ARM::BI__builtin_neon_vcalt_v: - std::swap(Ops[0], Ops[1]); - case ARM::BI__builtin_neon_vcagt_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtd); - return EmitNeonCall(F, Ops, "vcagt"); - } - case ARM::BI__builtin_neon_vcaltq_v: - std::swap(Ops[0], Ops[1]); - case ARM::BI__builtin_neon_vcagtq_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtq); - return EmitNeonCall(F, Ops, "vcagt"); - } - case ARM::BI__builtin_neon_vcls_v: - case ARM::BI__builtin_neon_vclsq_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcls, Ty); - return EmitNeonCall(F, Ops, "vcls"); - } - case ARM::BI__builtin_neon_vclz_v: - case ARM::BI__builtin_neon_vclzq_v: { - // Generate target-independent intrinsic; also need to add second argument - // for whether or not clz of zero is undefined; on ARM it isn't. 
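// Scalar model of the count-leading-zeros lowering noted in the comment above:
// ARM defines CLZ(0) as the full element width, so the generic ctlz intrinsic
// gets an explicit is-zero-undef argument, which is false on ARM. The 32-bit
// width below is just the illustrative case.
#include <cstdint>

static std::uint32_t clz32_ref(std::uint32_t x) {
  if (x == 0)
    return 32;                            // well-defined on ARM, unlike a raw __builtin_clz(0)
  std::uint32_t n = 0;
  while ((x & 0x80000000u) == 0) {
    x <<= 1;
    ++n;
  }
  return n;
}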
- Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ty); - Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef())); - return EmitNeonCall(F, Ops, "vclz"); - } - case ARM::BI__builtin_neon_vcnt_v: - case ARM::BI__builtin_neon_vcntq_v: { - // generate target-independent intrinsic - Function *F = CGM.getIntrinsic(Intrinsic::ctpop, Ty); - return EmitNeonCall(F, Ops, "vctpop"); - } - case ARM::BI__builtin_neon_vcvt_f16_v: { - assert(Type.getEltType() == NeonTypeFlags::Float16 && !quad && - "unexpected vcvt_f16_v builtin"); - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvtfp2hf); - return EmitNeonCall(F, Ops, "vcvt"); - } - case ARM::BI__builtin_neon_vcvt_f32_f16: { - assert(Type.getEltType() == NeonTypeFlags::Float16 && !quad && - "unexpected vcvt_f32_f16 builtin"); - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvthf2fp); - return EmitNeonCall(F, Ops, "vcvt"); - } - case ARM::BI__builtin_neon_vcvt_f32_v: - case ARM::BI__builtin_neon_vcvtq_f32_v: - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad)); - return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") - : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - case ARM::BI__builtin_neon_vcvt_s32_v: - case ARM::BI__builtin_neon_vcvt_u32_v: - case ARM::BI__builtin_neon_vcvtq_s32_v: - case ARM::BI__builtin_neon_vcvtq_u32_v: { - llvm::Type *FloatTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad)); - Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy); - return usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt") - : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); - } - case ARM::BI__builtin_neon_vcvt_n_f32_v: - case ARM::BI__builtin_neon_vcvtq_n_f32_v: { - llvm::Type *FloatTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad)); - llvm::Type *Tys[2] = { FloatTy, Ty }; - Int = usgn ? Intrinsic::arm_neon_vcvtfxu2fp - : Intrinsic::arm_neon_vcvtfxs2fp; - Function *F = CGM.getIntrinsic(Int, Tys); - return EmitNeonCall(F, Ops, "vcvt_n"); - } - case ARM::BI__builtin_neon_vcvt_n_s32_v: - case ARM::BI__builtin_neon_vcvt_n_u32_v: - case ARM::BI__builtin_neon_vcvtq_n_s32_v: - case ARM::BI__builtin_neon_vcvtq_n_u32_v: { - llvm::Type *FloatTy = - GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad)); - llvm::Type *Tys[2] = { Ty, FloatTy }; - Int = usgn ? Intrinsic::arm_neon_vcvtfp2fxu - : Intrinsic::arm_neon_vcvtfp2fxs; - Function *F = CGM.getIntrinsic(Int, Tys); - return EmitNeonCall(F, Ops, "vcvt_n"); - } - case ARM::BI__builtin_neon_vext_v: - case ARM::BI__builtin_neon_vextq_v: { - int CV = cast<ConstantInt>(Ops[2])->getSExtValue(); - SmallVector<Constant*, 16> Indices; - for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) - Indices.push_back(ConstantInt::get(Int32Ty, i+CV)); - - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Value *SV = llvm::ConstantVector::get(Indices); - return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext"); - } - case ARM::BI__builtin_neon_vhadd_v: - case ARM::BI__builtin_neon_vhaddq_v: - Int = usgn ? Intrinsic::arm_neon_vhaddu : Intrinsic::arm_neon_vhadds; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhadd"); - case ARM::BI__builtin_neon_vhsub_v: - case ARM::BI__builtin_neon_vhsubq_v: - Int = usgn ? 
Intrinsic::arm_neon_vhsubu : Intrinsic::arm_neon_vhsubs; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhsub"); - case ARM::BI__builtin_neon_vld1_v: - case ARM::BI__builtin_neon_vld1q_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Ty), - Ops, "vld1"); - case ARM::BI__builtin_neon_vld1q_lane_v: + default: return nullptr; + case NEON::BI__builtin_neon_vld1q_lane_v: // Handle 64-bit integer elements as a special case. Use shuffles of // one-element vectors to avoid poor code for i64 in the backend. if (VTy->getElementType()->isIntegerTy(64)) { @@ -4371,7 +3425,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return Builder.CreateShuffleVector(Ops[1], Ld, SV, "vld1q_lane"); } // fall through - case ARM::BI__builtin_neon_vld1_lane_v: { + case NEON::BI__builtin_neon_vld1_lane_v: { Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ty = llvm::PointerType::getUnqual(VTy->getElementType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); @@ -4379,90 +3433,19 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue()); return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane"); } - case ARM::BI__builtin_neon_vld1_dup_v: - case ARM::BI__builtin_neon_vld1q_dup_v: { - Value *V = UndefValue::get(Ty); - Ty = llvm::PointerType::getUnqual(VTy->getElementType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - LoadInst *Ld = Builder.CreateLoad(Ops[0]); - Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue()); - llvm::Constant *CI = ConstantInt::get(Int32Ty, 0); - Ops[0] = Builder.CreateInsertElement(V, Ld, CI); - return EmitNeonSplat(Ops[0], CI); - } - case ARM::BI__builtin_neon_vld2_v: - case ARM::BI__builtin_neon_vld2q_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2, Ty); - Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld2"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld3_v: - case ARM::BI__builtin_neon_vld3q_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3, Ty); - Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld3"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld4_v: - case ARM::BI__builtin_neon_vld4q_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4, Ty); - Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld4"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld2_lane_v: - case ARM::BI__builtin_neon_vld2q_lane_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2lane, Ty); - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Ops[3] = Builder.CreateBitCast(Ops[3], Ty); - Ops.push_back(Align); - Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld3_lane_v: - case ARM::BI__builtin_neon_vld3q_lane_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3lane, Ty); - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Ops[3] = Builder.CreateBitCast(Ops[3], Ty); - Ops[4] = Builder.CreateBitCast(Ops[4], Ty); - 
Ops.push_back(Align); - Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld4_lane_v: - case ARM::BI__builtin_neon_vld4q_lane_v: { - Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4lane, Ty); - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Ops[3] = Builder.CreateBitCast(Ops[3], Ty); - Ops[4] = Builder.CreateBitCast(Ops[4], Ty); - Ops[5] = Builder.CreateBitCast(Ops[5], Ty); - Ops.push_back(Align); - Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateStore(Ops[1], Ops[0]); - } - case ARM::BI__builtin_neon_vld2_dup_v: - case ARM::BI__builtin_neon_vld3_dup_v: - case ARM::BI__builtin_neon_vld4_dup_v: { + case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: { // Handle 64-bit elements as a special-case. There is no "dup" needed. if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) { switch (BuiltinID) { - case ARM::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2_dup_v: Int = Intrinsic::arm_neon_vld2; break; - case ARM::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: Int = Intrinsic::arm_neon_vld3; break; - case ARM::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: Int = Intrinsic::arm_neon_vld4; break; default: llvm_unreachable("unknown vld_dup intrinsic?"); @@ -4474,13 +3457,13 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return Builder.CreateStore(Ops[1], Ops[0]); } switch (BuiltinID) { - case ARM::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2_dup_v: Int = Intrinsic::arm_neon_vld2lane; break; - case ARM::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: Int = Intrinsic::arm_neon_vld3lane; break; - case ARM::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: Int = Intrinsic::arm_neon_vld4lane; break; default: llvm_unreachable("unknown vld_dup intrinsic?"); @@ -4509,251 +3492,58 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateBitCast(Ops[0], Ty); return Builder.CreateStore(Ops[1], Ops[0]); } - case ARM::BI__builtin_neon_vmax_v: - case ARM::BI__builtin_neon_vmaxq_v: - Int = usgn ? Intrinsic::arm_neon_vmaxu : Intrinsic::arm_neon_vmaxs; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax"); - case ARM::BI__builtin_neon_vmin_v: - case ARM::BI__builtin_neon_vminq_v: - Int = usgn ? 
Intrinsic::arm_neon_vminu : Intrinsic::arm_neon_vmins; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin"); - case ARM::BI__builtin_neon_vmovl_v: { - llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); - Ops[0] = Builder.CreateBitCast(Ops[0], DTy); - if (usgn) - return Builder.CreateZExt(Ops[0], Ty, "vmovl"); - return Builder.CreateSExt(Ops[0], Ty, "vmovl"); - } - case ARM::BI__builtin_neon_vmovn_v: { - llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); - Ops[0] = Builder.CreateBitCast(Ops[0], QTy); - return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); - } - case ARM::BI__builtin_neon_vmul_v: - case ARM::BI__builtin_neon_vmulq_v: - assert(Type.isPoly() && "vmul builtin only supported for polynomial types"); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vmulp, Ty), - Ops, "vmul"); - case ARM::BI__builtin_neon_vmull_v: - // FIXME: the integer vmull operations could be emitted in terms of pure - // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of - // hoisting the exts outside loops. Until global ISel comes along that can - // see through such movement this leads to bad CodeGen. So we need an - // intrinsic for now. - Int = usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls; - Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull"); - case ARM::BI__builtin_neon_vfma_v: - case ARM::BI__builtin_neon_vfmaq_v: { - Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - - // NEON intrinsic puts accumulator first, unlike the LLVM fma. - return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); - } - case ARM::BI__builtin_neon_vpadal_v: - case ARM::BI__builtin_neon_vpadalq_v: { - Int = usgn ? Intrinsic::arm_neon_vpadalu : Intrinsic::arm_neon_vpadals; - // The source operand type has twice as many elements of half the size. - unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); - llvm::Type *EltTy = - llvm::IntegerType::get(getLLVMContext(), EltBits / 2); - llvm::Type *NarrowTy = - llvm::VectorType::get(EltTy, VTy->getNumElements() * 2); - llvm::Type *Tys[2] = { Ty, NarrowTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpadal"); - } - case ARM::BI__builtin_neon_vpadd_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vpadd, Ty), - Ops, "vpadd"); - case ARM::BI__builtin_neon_vpaddl_v: - case ARM::BI__builtin_neon_vpaddlq_v: { - Int = usgn ? Intrinsic::arm_neon_vpaddlu : Intrinsic::arm_neon_vpaddls; - // The source operand type has twice as many elements of half the size. - unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits(); - llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2); - llvm::Type *NarrowTy = - llvm::VectorType::get(EltTy, VTy->getNumElements() * 2); - llvm::Type *Tys[2] = { Ty, NarrowTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl"); - } - case ARM::BI__builtin_neon_vpmax_v: - Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax"); - case ARM::BI__builtin_neon_vpmin_v: - Int = usgn ? 
Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin"); - case ARM::BI__builtin_neon_vqabs_v: - case ARM::BI__builtin_neon_vqabsq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqabs, Ty), - Ops, "vqabs"); - case ARM::BI__builtin_neon_vqadd_v: - case ARM::BI__builtin_neon_vqaddq_v: - Int = usgn ? Intrinsic::arm_neon_vqaddu : Intrinsic::arm_neon_vqadds; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqadd"); - case ARM::BI__builtin_neon_vqdmlal_v: { - SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end()); - Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty), - MulOps, "vqdmlal"); - - SmallVector<Value *, 2> AddOps; - AddOps.push_back(Ops[0]); - AddOps.push_back(Mul); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqadds, Ty), - AddOps, "vqdmlal"); - } - case ARM::BI__builtin_neon_vqdmlsl_v: { - SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end()); - Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty), - MulOps, "vqdmlsl"); - - SmallVector<Value *, 2> SubOps; - SubOps.push_back(Ops[0]); - SubOps.push_back(Mul); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqsubs, Ty), - SubOps, "vqdmlsl"); - } - case ARM::BI__builtin_neon_vqdmulh_v: - case ARM::BI__builtin_neon_vqdmulhq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmulh, Ty), - Ops, "vqdmulh"); - case ARM::BI__builtin_neon_vqdmull_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty), - Ops, "vqdmull"); - case ARM::BI__builtin_neon_vqmovn_v: - Int = usgn ? Intrinsic::arm_neon_vqmovnu : Intrinsic::arm_neon_vqmovns; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqmovn"); - case ARM::BI__builtin_neon_vqmovun_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqmovnsu, Ty), - Ops, "vqdmull"); - case ARM::BI__builtin_neon_vqneg_v: - case ARM::BI__builtin_neon_vqnegq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqneg, Ty), - Ops, "vqneg"); - case ARM::BI__builtin_neon_vqrdmulh_v: - case ARM::BI__builtin_neon_vqrdmulhq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrdmulh, Ty), - Ops, "vqrdmulh"); - case ARM::BI__builtin_neon_vqrshl_v: - case ARM::BI__builtin_neon_vqrshlq_v: - Int = usgn ? Intrinsic::arm_neon_vqrshiftu : Intrinsic::arm_neon_vqrshifts; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshl"); - case ARM::BI__builtin_neon_vqrshrn_n_v: + case NEON::BI__builtin_neon_vqrshrn_n_v: Int = usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n", 1, true); - case ARM::BI__builtin_neon_vqrshrun_n_v: + case NEON::BI__builtin_neon_vqrshrun_n_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty), Ops, "vqrshrun_n", 1, true); - case ARM::BI__builtin_neon_vqshl_v: - case ARM::BI__builtin_neon_vqshlq_v: - Int = usgn ? Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl"); - case ARM::BI__builtin_neon_vqshl_n_v: - case ARM::BI__builtin_neon_vqshlq_n_v: - Int = usgn ? 
Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n", - 1, false); - case ARM::BI__builtin_neon_vqshlu_n_v: - case ARM::BI__builtin_neon_vqshluq_n_v: + case NEON::BI__builtin_neon_vqshlu_n_v: + case NEON::BI__builtin_neon_vqshluq_n_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftsu, Ty), Ops, "vqshlu", 1, false); - case ARM::BI__builtin_neon_vqshrn_n_v: + case NEON::BI__builtin_neon_vqshrn_n_v: Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n", 1, true); - case ARM::BI__builtin_neon_vqshrun_n_v: + case NEON::BI__builtin_neon_vqshrun_n_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty), Ops, "vqshrun_n", 1, true); - case ARM::BI__builtin_neon_vqsub_v: - case ARM::BI__builtin_neon_vqsubq_v: - Int = usgn ? Intrinsic::arm_neon_vqsubu : Intrinsic::arm_neon_vqsubs; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqsub"); - case ARM::BI__builtin_neon_vraddhn_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vraddhn, Ty), - Ops, "vraddhn"); - case ARM::BI__builtin_neon_vrecpe_v: - case ARM::BI__builtin_neon_vrecpeq_v: + case NEON::BI__builtin_neon_vrecpe_v: + case NEON::BI__builtin_neon_vrecpeq_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty), Ops, "vrecpe"); - case ARM::BI__builtin_neon_vrecps_v: - case ARM::BI__builtin_neon_vrecpsq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecps, Ty), - Ops, "vrecps"); - case ARM::BI__builtin_neon_vrhadd_v: - case ARM::BI__builtin_neon_vrhaddq_v: - Int = usgn ? Intrinsic::arm_neon_vrhaddu : Intrinsic::arm_neon_vrhadds; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrhadd"); - case ARM::BI__builtin_neon_vrshl_v: - case ARM::BI__builtin_neon_vrshlq_v: - Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshl"); - case ARM::BI__builtin_neon_vrshrn_n_v: + case NEON::BI__builtin_neon_vrshrn_n_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty), Ops, "vrshrn_n", 1, true); - case ARM::BI__builtin_neon_vrshr_n_v: - case ARM::BI__builtin_neon_vrshrq_n_v: + case NEON::BI__builtin_neon_vrshr_n_v: + case NEON::BI__builtin_neon_vrshrq_n_v: Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true); - case ARM::BI__builtin_neon_vrsqrte_v: - case ARM::BI__builtin_neon_vrsqrteq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrte, Ty), - Ops, "vrsqrte"); - case ARM::BI__builtin_neon_vrsqrts_v: - case ARM::BI__builtin_neon_vrsqrtsq_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrts, Ty), - Ops, "vrsqrts"); - case ARM::BI__builtin_neon_vrsra_n_v: - case ARM::BI__builtin_neon_vrsraq_n_v: + case NEON::BI__builtin_neon_vrsra_n_v: + case NEON::BI__builtin_neon_vrsraq_n_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true); Int = usgn ? 
Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts; Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]); return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n"); - case ARM::BI__builtin_neon_vrsubhn_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsubhn, Ty), - Ops, "vrsubhn"); - case ARM::BI__builtin_neon_vshl_v: - case ARM::BI__builtin_neon_vshlq_v: - Int = usgn ? Intrinsic::arm_neon_vshiftu : Intrinsic::arm_neon_vshifts; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vshl"); - case ARM::BI__builtin_neon_vshll_n_v: - Int = usgn ? Intrinsic::arm_neon_vshiftlu : Intrinsic::arm_neon_vshiftls; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vshll", 1); - case ARM::BI__builtin_neon_vshl_n_v: - case ARM::BI__builtin_neon_vshlq_n_v: - Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false); - return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], - "vshl_n"); - case ARM::BI__builtin_neon_vshrn_n_v: - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftn, Ty), - Ops, "vshrn_n", 1, true); - case ARM::BI__builtin_neon_vshr_n_v: - case ARM::BI__builtin_neon_vshrq_n_v: - return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, usgn, "vshr_n"); - case ARM::BI__builtin_neon_vsri_n_v: - case ARM::BI__builtin_neon_vsriq_n_v: + case NEON::BI__builtin_neon_vsri_n_v: + case NEON::BI__builtin_neon_vsriq_n_v: rightShift = true; - case ARM::BI__builtin_neon_vsli_n_v: - case ARM::BI__builtin_neon_vsliq_n_v: + case NEON::BI__builtin_neon_vsli_n_v: + case NEON::BI__builtin_neon_vsliq_n_v: Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty), Ops, "vsli_n"); - case ARM::BI__builtin_neon_vsra_n_v: - case ARM::BI__builtin_neon_vsraq_n_v: + case NEON::BI__builtin_neon_vsra_n_v: + case NEON::BI__builtin_neon_vsraq_n_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n"); return Builder.CreateAdd(Ops[0], Ops[1]); - case ARM::BI__builtin_neon_vst1_v: - case ARM::BI__builtin_neon_vst1q_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst1q_lane_v: + case NEON::BI__builtin_neon_vst1q_lane_v: // Handle 64-bit integer elements as a special case. Use a shuffle to get // a one-element vector and avoid poor code for i64 in the backend. 
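A short usage sketch for the shift-and-accumulate builtins kept in this switch, assuming <arm_neon.h> on a NEON target; as in the vsra_n case above, the builtin is emitted as an immediate right shift of the second operand followed by a plain vector add, with no dedicated intrinsic call.

  #include <arm_neon.h>

  // acc + (v >> 3), element-wise; unsigned input so the shift is a lshr.
  uint32x4_t shift_accumulate(uint32x4_t acc, uint32x4_t v) {
    return vsraq_n_u32(acc, v, 3);
  }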
if (VTy->getElementType()->isIntegerTy(64)) { @@ -4765,7 +3555,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, Ops[1]->getType()), Ops); } // fall through - case ARM::BI__builtin_neon_vst1_lane_v: { + case NEON::BI__builtin_neon_vst1_lane_v: { Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]); Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); @@ -4774,99 +3564,1984 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, St->setAlignment(cast<ConstantInt>(Align)->getZExtValue()); return St; } - case ARM::BI__builtin_neon_vst2_v: - case ARM::BI__builtin_neon_vst2q_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst2_lane_v: - case ARM::BI__builtin_neon_vst2q_lane_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2lane, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst3_v: - case ARM::BI__builtin_neon_vst3q_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst3_lane_v: - case ARM::BI__builtin_neon_vst3q_lane_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3lane, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst4_v: - case ARM::BI__builtin_neon_vst4q_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vst4_lane_v: - case ARM::BI__builtin_neon_vst4q_lane_v: - Ops.push_back(Align); - return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4lane, Ty), - Ops, ""); - case ARM::BI__builtin_neon_vsubhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); - - // %sum = add <4 x i32> %lhs, %rhs - Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); - Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy); - Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn"); - - // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> - Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(), - SrcTy->getScalarSizeInBits() / 2); - ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt); - Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn"); - - // %res = trunc <4 x i32> %high to <4 x i16> - return Builder.CreateTrunc(Ops[0], VTy, "vsubhn"); - } - case ARM::BI__builtin_neon_vtbl1_v: + case NEON::BI__builtin_neon_vtbl1_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1), Ops, "vtbl1"); - case ARM::BI__builtin_neon_vtbl2_v: + case NEON::BI__builtin_neon_vtbl2_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2), Ops, "vtbl2"); - case ARM::BI__builtin_neon_vtbl3_v: + case NEON::BI__builtin_neon_vtbl3_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3), Ops, "vtbl3"); - case ARM::BI__builtin_neon_vtbl4_v: + case NEON::BI__builtin_neon_vtbl4_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4), Ops, "vtbl4"); - case ARM::BI__builtin_neon_vtbx1_v: + case NEON::BI__builtin_neon_vtbx1_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1), Ops, "vtbx1"); - case ARM::BI__builtin_neon_vtbx2_v: + case NEON::BI__builtin_neon_vtbx2_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2), Ops, "vtbx2"); - case ARM::BI__builtin_neon_vtbx3_v: + case NEON::BI__builtin_neon_vtbx3_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3), Ops, "vtbx3"); - case 
ARM::BI__builtin_neon_vtbx4_v: + case NEON::BI__builtin_neon_vtbx4_v: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4), Ops, "vtbx4"); - case ARM::BI__builtin_neon_vtst_v: - case ARM::BI__builtin_neon_vtstq_v: { + } +} + +static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID, + const CallExpr *E, + SmallVectorImpl<Value *> &Ops) { + unsigned int Int = 0; + const char *s = nullptr; + + switch (BuiltinID) { + default: + return nullptr; + case NEON::BI__builtin_neon_vtbl1_v: + case NEON::BI__builtin_neon_vqtbl1_v: + case NEON::BI__builtin_neon_vqtbl1q_v: + case NEON::BI__builtin_neon_vtbl2_v: + case NEON::BI__builtin_neon_vqtbl2_v: + case NEON::BI__builtin_neon_vqtbl2q_v: + case NEON::BI__builtin_neon_vtbl3_v: + case NEON::BI__builtin_neon_vqtbl3_v: + case NEON::BI__builtin_neon_vqtbl3q_v: + case NEON::BI__builtin_neon_vtbl4_v: + case NEON::BI__builtin_neon_vqtbl4_v: + case NEON::BI__builtin_neon_vqtbl4q_v: + break; + case NEON::BI__builtin_neon_vtbx1_v: + case NEON::BI__builtin_neon_vqtbx1_v: + case NEON::BI__builtin_neon_vqtbx1q_v: + case NEON::BI__builtin_neon_vtbx2_v: + case NEON::BI__builtin_neon_vqtbx2_v: + case NEON::BI__builtin_neon_vqtbx2q_v: + case NEON::BI__builtin_neon_vtbx3_v: + case NEON::BI__builtin_neon_vqtbx3_v: + case NEON::BI__builtin_neon_vqtbx3q_v: + case NEON::BI__builtin_neon_vtbx4_v: + case NEON::BI__builtin_neon_vqtbx4_v: + case NEON::BI__builtin_neon_vqtbx4q_v: + break; + } + + assert(E->getNumArgs() >= 3); + + // Get the last argument, which specifies the vector type. + llvm::APSInt Result; + const Expr *Arg = E->getArg(E->getNumArgs() - 1); + if (!Arg->isIntegerConstantExpr(Result, CGF.getContext())) + return nullptr; + + // Determine the type of this overloaded NEON intrinsic. + NeonTypeFlags Type(Result.getZExtValue()); + llvm::VectorType *VTy = GetNeonType(&CGF, Type); + llvm::Type *Ty = VTy; + if (!Ty) + return nullptr; + + unsigned nElts = VTy->getNumElements(); + + CodeGen::CGBuilderTy &Builder = CGF.Builder; + + // AArch64 scalar builtins are not overloaded, they do not have an extra + // argument that specifies the vector type, need to handle each case. 
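A minimal usage sketch for the table-lookup builtins that the new EmitAArch64TblBuiltinExpr routine handles, assuming <arm_neon.h> on an AArch64 target (function names here are illustrative only):

  #include <arm_neon.h>

  // vqtbl1q: out-of-range indices produce 0; maps onto aarch64.neon.tbl1.
  uint8x16_t permute(uint8x16_t table, uint8x16_t indices) {
    return vqtbl1q_u8(table, indices);
  }

  // vtbx1: out-of-range indices keep the corresponding lane of 'fallback',
  // which the code below emulates with a compare and select around the tbl.
  uint8x8_t permute_or_keep(uint8x8_t fallback, uint8x8_t table,
                            uint8x8_t indices) {
    return vtbx1_u8(fallback, table, indices);
  }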
+ SmallVector<Value *, 2> TblOps; + switch (BuiltinID) { + case NEON::BI__builtin_neon_vtbl1_v: { + TblOps.push_back(Ops[0]); + return packTBLDVectorList(CGF, TblOps, nullptr, Ops[1], Ty, + Intrinsic::aarch64_neon_tbl1, "vtbl1"); + } + case NEON::BI__builtin_neon_vtbl2_v: { + TblOps.push_back(Ops[0]); + TblOps.push_back(Ops[1]); + return packTBLDVectorList(CGF, TblOps, nullptr, Ops[2], Ty, + Intrinsic::aarch64_neon_tbl1, "vtbl1"); + } + case NEON::BI__builtin_neon_vtbl3_v: { + TblOps.push_back(Ops[0]); + TblOps.push_back(Ops[1]); + TblOps.push_back(Ops[2]); + return packTBLDVectorList(CGF, TblOps, nullptr, Ops[3], Ty, + Intrinsic::aarch64_neon_tbl2, "vtbl2"); + } + case NEON::BI__builtin_neon_vtbl4_v: { + TblOps.push_back(Ops[0]); + TblOps.push_back(Ops[1]); + TblOps.push_back(Ops[2]); + TblOps.push_back(Ops[3]); + return packTBLDVectorList(CGF, TblOps, nullptr, Ops[4], Ty, + Intrinsic::aarch64_neon_tbl2, "vtbl2"); + } + case NEON::BI__builtin_neon_vtbx1_v: { + TblOps.push_back(Ops[1]); + Value *TblRes = packTBLDVectorList(CGF, TblOps, nullptr, Ops[2], Ty, + Intrinsic::aarch64_neon_tbl1, "vtbl1"); + + llvm::Constant *Eight = ConstantInt::get(VTy->getElementType(), 8); + Value* EightV = llvm::ConstantVector::getSplat(nElts, Eight); + Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV); + CmpRes = Builder.CreateSExt(CmpRes, Ty); + + Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]); + Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes); + return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx"); + } + case NEON::BI__builtin_neon_vtbx2_v: { + TblOps.push_back(Ops[1]); + TblOps.push_back(Ops[2]); + return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty, + Intrinsic::aarch64_neon_tbx1, "vtbx1"); + } + case NEON::BI__builtin_neon_vtbx3_v: { + TblOps.push_back(Ops[1]); + TblOps.push_back(Ops[2]); + TblOps.push_back(Ops[3]); + Value *TblRes = packTBLDVectorList(CGF, TblOps, nullptr, Ops[4], Ty, + Intrinsic::aarch64_neon_tbl2, "vtbl2"); + + llvm::Constant *TwentyFour = ConstantInt::get(VTy->getElementType(), 24); + Value* TwentyFourV = llvm::ConstantVector::getSplat(nElts, TwentyFour); + Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4], + TwentyFourV); + CmpRes = Builder.CreateSExt(CmpRes, Ty); + + Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]); + Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes); + return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx"); + } + case NEON::BI__builtin_neon_vtbx4_v: { + TblOps.push_back(Ops[1]); + TblOps.push_back(Ops[2]); + TblOps.push_back(Ops[3]); + TblOps.push_back(Ops[4]); + return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty, + Intrinsic::aarch64_neon_tbx2, "vtbx2"); + } + case NEON::BI__builtin_neon_vqtbl1_v: + case NEON::BI__builtin_neon_vqtbl1q_v: + Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break; + case NEON::BI__builtin_neon_vqtbl2_v: + case NEON::BI__builtin_neon_vqtbl2q_v: { + Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break; + case NEON::BI__builtin_neon_vqtbl3_v: + case NEON::BI__builtin_neon_vqtbl3q_v: + Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break; + case NEON::BI__builtin_neon_vqtbl4_v: + case NEON::BI__builtin_neon_vqtbl4q_v: + Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break; + case NEON::BI__builtin_neon_vqtbx1_v: + case NEON::BI__builtin_neon_vqtbx1q_v: + Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break; + case NEON::BI__builtin_neon_vqtbx2_v: + case NEON::BI__builtin_neon_vqtbx2q_v: + Int = 
Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break; + case NEON::BI__builtin_neon_vqtbx3_v: + case NEON::BI__builtin_neon_vqtbx3q_v: + Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break; + case NEON::BI__builtin_neon_vqtbx4_v: + case NEON::BI__builtin_neon_vqtbx4q_v: + Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break; + } + } + + if (!Int) + return nullptr; + + Function *F = CGF.CGM.getIntrinsic(Int, Ty); + return CGF.EmitNeonCall(F, Ops, s); +} + +Value *CodeGenFunction::vectorWrapScalar16(Value *Op) { + llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4); + Op = Builder.CreateBitCast(Op, Int16Ty); + Value *V = UndefValue::get(VTy); + llvm::Constant *CI = ConstantInt::get(SizeTy, 0); + Op = Builder.CreateInsertElement(V, Op, CI); + return Op; +} + +Value *CodeGenFunction::vectorWrapScalar8(Value *Op) { + llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8); + Op = Builder.CreateBitCast(Op, Int8Ty); + Value *V = UndefValue::get(VTy); + llvm::Constant *CI = ConstantInt::get(SizeTy, 0); + Op = Builder.CreateInsertElement(V, Op, CI); + return Op; +} + +Value *CodeGenFunction:: +emitVectorWrappedScalar8Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops, + const char *Name) { + // i8 is not a legal types for AArch64, so we can't just use + // a normal overloaded intrinsic call for these scalar types. Instead + // we'll build 64-bit vectors w/ lane zero being our input values and + // perform the operation on that. The back end can pattern match directly + // to the scalar instruction. + Ops[0] = vectorWrapScalar8(Ops[0]); + Ops[1] = vectorWrapScalar8(Ops[1]); + llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8); + Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name); + Constant *CI = ConstantInt::get(SizeTy, 0); + return Builder.CreateExtractElement(V, CI, "lane0"); +} + +Value *CodeGenFunction:: +emitVectorWrappedScalar16Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops, + const char *Name) { + // i16 is not a legal types for AArch64, so we can't just use + // a normal overloaded intrinsic call for these scalar types. Instead + // we'll build 64-bit vectors w/ lane zero being our input values and + // perform the operation on that. The back end can pattern match directly + // to the scalar instruction. 
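The wrap-in-a-vector trick described in the comments above is what makes scalar i8/i16 ACLE operations expressible on AArch64: the operands are placed in lane 0 of a 64-bit vector, the vector instruction runs, and lane 0 is extracted again. A hedged usage sketch, assuming <arm_neon.h> on AArch64; whether a particular builtin is routed through these helpers or through the SISD table is internal to this patch.

  #include <arm_neon.h>

  // Scalar 16-bit saturating add: saturates instead of wrapping on overflow.
  int16_t saturating_add(int16_t a, int16_t b) {
    return vqaddh_s16(a, b);
  }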
+ Ops[0] = vectorWrapScalar16(Ops[0]); + Ops[1] = vectorWrapScalar16(Ops[1]); + llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4); + Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name); + Constant *CI = ConstantInt::get(SizeTy, 0); + return Builder.CreateExtractElement(V, CI, "lane0"); +} + +Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + unsigned HintID = static_cast<unsigned>(-1); + switch (BuiltinID) { + default: break; + case AArch64::BI__builtin_arm_nop: + HintID = 0; + break; + case AArch64::BI__builtin_arm_yield: + HintID = 1; + break; + case AArch64::BI__builtin_arm_wfe: + HintID = 2; + break; + case AArch64::BI__builtin_arm_wfi: + HintID = 3; + break; + case AArch64::BI__builtin_arm_sev: + HintID = 4; + break; + case AArch64::BI__builtin_arm_sevl: + HintID = 5; + break; + } + + if (HintID != static_cast<unsigned>(-1)) { + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint); + return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); + } + + if (BuiltinID == AArch64::BI__builtin_arm_rbit) { + assert((getContext().getTypeSize(E->getType()) == 32) && + "rbit of unusual size!"); + llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); + return Builder.CreateCall( + CGM.getIntrinsic(Intrinsic::aarch64_rbit, Arg->getType()), Arg, "rbit"); + } + if (BuiltinID == AArch64::BI__builtin_arm_rbit64) { + assert((getContext().getTypeSize(E->getType()) == 64) && + "rbit of unusual size!"); + llvm::Value *Arg = EmitScalarExpr(E->getArg(0)); + return Builder.CreateCall( + CGM.getIntrinsic(Intrinsic::aarch64_rbit, Arg->getType()), Arg, "rbit"); + } + + if (BuiltinID == AArch64::BI__clear_cache) { + assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments"); + const FunctionDecl *FD = E->getDirectCallee(); + SmallVector<Value*, 2> Ops; + for (unsigned i = 0; i < 2; i++) + Ops.push_back(EmitScalarExpr(E->getArg(i))); + llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType()); + llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); + StringRef Name = FD->getName(); + return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops); + } + + if ((BuiltinID == AArch64::BI__builtin_arm_ldrex || + BuiltinID == AArch64::BI__builtin_arm_ldaex) && + getContext().getTypeSize(E->getType()) == 128) { + Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex + ? 
Intrinsic::aarch64_ldaxp + : Intrinsic::aarch64_ldxp); + + Value *LdPtr = EmitScalarExpr(E->getArg(0)); + Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy), + "ldxp"); + + Value *Val0 = Builder.CreateExtractValue(Val, 1); + Value *Val1 = Builder.CreateExtractValue(Val, 0); + llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); + Val0 = Builder.CreateZExt(Val0, Int128Ty); + Val1 = Builder.CreateZExt(Val1, Int128Ty); + + Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64); + Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */); + Val = Builder.CreateOr(Val, Val1); + return Builder.CreateBitCast(Val, ConvertType(E->getType())); + } else if (BuiltinID == AArch64::BI__builtin_arm_ldrex || + BuiltinID == AArch64::BI__builtin_arm_ldaex) { + Value *LoadAddr = EmitScalarExpr(E->getArg(0)); + + QualType Ty = E->getType(); + llvm::Type *RealResTy = ConvertType(Ty); + llvm::Type *IntResTy = llvm::IntegerType::get(getLLVMContext(), + getContext().getTypeSize(Ty)); + LoadAddr = Builder.CreateBitCast(LoadAddr, IntResTy->getPointerTo()); + + Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_ldaex + ? Intrinsic::aarch64_ldaxr + : Intrinsic::aarch64_ldxr, + LoadAddr->getType()); + Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr"); + + if (RealResTy->isPointerTy()) + return Builder.CreateIntToPtr(Val, RealResTy); + + Val = Builder.CreateTruncOrBitCast(Val, IntResTy); + return Builder.CreateBitCast(Val, RealResTy); + } + + if ((BuiltinID == AArch64::BI__builtin_arm_strex || + BuiltinID == AArch64::BI__builtin_arm_stlex) && + getContext().getTypeSize(E->getArg(0)->getType()) == 128) { + Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex + ? Intrinsic::aarch64_stlxp + : Intrinsic::aarch64_stxp); + llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty, NULL); + + Value *One = llvm::ConstantInt::get(Int32Ty, 1); + Value *Tmp = Builder.CreateAlloca(ConvertType(E->getArg(0)->getType()), + One); + Value *Val = EmitScalarExpr(E->getArg(0)); + Builder.CreateStore(Val, Tmp); + + Value *LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy)); + Val = Builder.CreateLoad(LdPtr); + + Value *Arg0 = Builder.CreateExtractValue(Val, 0); + Value *Arg1 = Builder.CreateExtractValue(Val, 1); + Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), + Int8PtrTy); + return Builder.CreateCall3(F, Arg0, Arg1, StPtr, "stxp"); + } else if (BuiltinID == AArch64::BI__builtin_arm_strex || + BuiltinID == AArch64::BI__builtin_arm_stlex) { + Value *StoreVal = EmitScalarExpr(E->getArg(0)); + Value *StoreAddr = EmitScalarExpr(E->getArg(1)); + + QualType Ty = E->getArg(0)->getType(); + llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(), + getContext().getTypeSize(Ty)); + StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo()); + + if (StoreVal->getType()->isPointerTy()) + StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty); + else { + StoreVal = Builder.CreateBitCast(StoreVal, StoreTy); + StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty); + } + + Function *F = CGM.getIntrinsic(BuiltinID == AArch64::BI__builtin_arm_stlex + ? 
Intrinsic::aarch64_stlxr + : Intrinsic::aarch64_stxr, + StoreAddr->getType()); + return Builder.CreateCall2(F, StoreVal, StoreAddr, "stxr"); + } + + if (BuiltinID == AArch64::BI__builtin_arm_clrex) { + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex); + return Builder.CreateCall(F); + } + + // CRC32 + Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic; + switch (BuiltinID) { + case AArch64::BI__builtin_arm_crc32b: + CRCIntrinsicID = Intrinsic::aarch64_crc32b; break; + case AArch64::BI__builtin_arm_crc32cb: + CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break; + case AArch64::BI__builtin_arm_crc32h: + CRCIntrinsicID = Intrinsic::aarch64_crc32h; break; + case AArch64::BI__builtin_arm_crc32ch: + CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break; + case AArch64::BI__builtin_arm_crc32w: + CRCIntrinsicID = Intrinsic::aarch64_crc32w; break; + case AArch64::BI__builtin_arm_crc32cw: + CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break; + case AArch64::BI__builtin_arm_crc32d: + CRCIntrinsicID = Intrinsic::aarch64_crc32x; break; + case AArch64::BI__builtin_arm_crc32cd: + CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break; + } + + if (CRCIntrinsicID != Intrinsic::not_intrinsic) { + Value *Arg0 = EmitScalarExpr(E->getArg(0)); + Value *Arg1 = EmitScalarExpr(E->getArg(1)); + Function *F = CGM.getIntrinsic(CRCIntrinsicID); + + llvm::Type *DataTy = F->getFunctionType()->getParamType(1); + Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy); + + return Builder.CreateCall2(F, Arg0, Arg1); + } + + llvm::SmallVector<Value*, 4> Ops; + for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) + Ops.push_back(EmitScalarExpr(E->getArg(i))); + + ArrayRef<NeonIntrinsicInfo> SISDMap(AArch64SISDIntrinsicMap); + const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap( + SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted); + + if (Builtin) { + Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1))); + Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E); + assert(Result && "SISD intrinsic should have been handled"); + return Result; + } + + llvm::APSInt Result; + const Expr *Arg = E->getArg(E->getNumArgs()-1); + NeonTypeFlags Type(0); + if (Arg->isIntegerConstantExpr(Result, getContext())) + // Determine the type of this overloaded NEON intrinsic. + Type = NeonTypeFlags(Result.getZExtValue()); + + bool usgn = Type.isUnsigned(); + bool quad = Type.isQuad(); + + // Handle non-overloaded intrinsics first. + switch (BuiltinID) { + default: break; + case NEON::BI__builtin_neon_vldrq_p128: { + llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128); + Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy); + return Builder.CreateLoad(Ptr); + } + case NEON::BI__builtin_neon_vstrq_p128: { + llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128); + Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy); + return Builder.CreateStore(EmitScalarExpr(E->getArg(1)), Ptr); + } + case NEON::BI__builtin_neon_vcvts_u32_f32: + case NEON::BI__builtin_neon_vcvtd_u64_f64: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvts_s32_f32: + case NEON::BI__builtin_neon_vcvtd_s64_f64: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64; + llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty; + llvm::Type *FTy = Is64 ? 
DoubleTy : FloatTy; + Ops[0] = Builder.CreateBitCast(Ops[0], FTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], InTy); + return Builder.CreateFPToSI(Ops[0], InTy); + } + case NEON::BI__builtin_neon_vcvts_f32_u32: + case NEON::BI__builtin_neon_vcvtd_f64_u64: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvts_f32_s32: + case NEON::BI__builtin_neon_vcvtd_f64_s64: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64; + llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty; + llvm::Type *FTy = Is64 ? DoubleTy : FloatTy; + Ops[0] = Builder.CreateBitCast(Ops[0], InTy); + if (usgn) + return Builder.CreateUIToFP(Ops[0], FTy); + return Builder.CreateSIToFP(Ops[0], FTy); + } + case NEON::BI__builtin_neon_vpaddd_s64: { + llvm::Type *Ty = + llvm::VectorType::get(llvm::Type::getInt64Ty(getLLVMContext()), 2); + Value *Vec = EmitScalarExpr(E->getArg(0)); + // The vector is v2f64, so make sure it's bitcast to that. + Vec = Builder.CreateBitCast(Vec, Ty, "v2i64"); + llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); + llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); + Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); + Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); + // Pairwise addition of a v2f64 into a scalar f64. + return Builder.CreateAdd(Op0, Op1, "vpaddd"); + } + case NEON::BI__builtin_neon_vpaddd_f64: { + llvm::Type *Ty = + llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2); + Value *Vec = EmitScalarExpr(E->getArg(0)); + // The vector is v2f64, so make sure it's bitcast to that. + Vec = Builder.CreateBitCast(Vec, Ty, "v2f64"); + llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); + llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); + Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); + Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); + // Pairwise addition of a v2f64 into a scalar f64. + return Builder.CreateFAdd(Op0, Op1, "vpaddd"); + } + case NEON::BI__builtin_neon_vpadds_f32: { + llvm::Type *Ty = + llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2); + Value *Vec = EmitScalarExpr(E->getArg(0)); + // The vector is v2f32, so make sure it's bitcast to that. + Vec = Builder.CreateBitCast(Vec, Ty, "v2f32"); + llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0); + llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1); + Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0"); + Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1"); + // Pairwise addition of a v2f32 into a scalar f32. 
+ return Builder.CreateFAdd(Op0, Op1, "vpaddd"); + } + case NEON::BI__builtin_neon_vceqzd_s64: + case NEON::BI__builtin_neon_vceqzd_f64: + case NEON::BI__builtin_neon_vceqzs_f32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OEQ, + ICmpInst::ICMP_EQ, "vceqz"); + case NEON::BI__builtin_neon_vcgezd_s64: + case NEON::BI__builtin_neon_vcgezd_f64: + case NEON::BI__builtin_neon_vcgezs_f32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGE, + ICmpInst::ICMP_SGE, "vcgez"); + case NEON::BI__builtin_neon_vclezd_s64: + case NEON::BI__builtin_neon_vclezd_f64: + case NEON::BI__builtin_neon_vclezs_f32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLE, + ICmpInst::ICMP_SLE, "vclez"); + case NEON::BI__builtin_neon_vcgtzd_s64: + case NEON::BI__builtin_neon_vcgtzd_f64: + case NEON::BI__builtin_neon_vcgtzs_f32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGT, + ICmpInst::ICMP_SGT, "vcgtz"); + case NEON::BI__builtin_neon_vcltzd_s64: + case NEON::BI__builtin_neon_vcltzd_f64: + case NEON::BI__builtin_neon_vcltzs_f32: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLT, + ICmpInst::ICMP_SLT, "vcltz"); + + case NEON::BI__builtin_neon_vceqzd_u64: { + llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext()); + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[0] = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, Ops[0], + llvm::Constant::getNullValue(Ty)); + return Builder.CreateSExt(Ops[0], Ty, "vceqzd"); + } + case NEON::BI__builtin_neon_vceqd_f64: + case NEON::BI__builtin_neon_vcled_f64: + case NEON::BI__builtin_neon_vcltd_f64: + case NEON::BI__builtin_neon_vcged_f64: + case NEON::BI__builtin_neon_vcgtd_f64: { + llvm::CmpInst::Predicate P; + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break; + case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break; + case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break; + case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break; + case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break; + } + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); + Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); + Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); + return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd"); + } + case NEON::BI__builtin_neon_vceqs_f32: + case NEON::BI__builtin_neon_vcles_f32: + case NEON::BI__builtin_neon_vclts_f32: + case NEON::BI__builtin_neon_vcges_f32: + case NEON::BI__builtin_neon_vcgts_f32: { + llvm::CmpInst::Predicate P; + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break; + case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break; + case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break; + case NEON::BI__builtin_neon_vcges_f32: P = 
llvm::FCmpInst::FCMP_OGE; break; + case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break; + } + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy); + Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy); + Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); + return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd"); + } + case NEON::BI__builtin_neon_vceqd_s64: + case NEON::BI__builtin_neon_vceqd_u64: + case NEON::BI__builtin_neon_vcgtd_s64: + case NEON::BI__builtin_neon_vcgtd_u64: + case NEON::BI__builtin_neon_vcltd_s64: + case NEON::BI__builtin_neon_vcltd_u64: + case NEON::BI__builtin_neon_vcged_u64: + case NEON::BI__builtin_neon_vcged_s64: + case NEON::BI__builtin_neon_vcled_u64: + case NEON::BI__builtin_neon_vcled_s64: { + llvm::CmpInst::Predicate P; + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vceqd_s64: + case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break; + case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break; + case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break; + case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break; + case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break; + case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break; + case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break; + case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break; + case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break; + } + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty); + Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]); + return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd"); + } + case NEON::BI__builtin_neon_vtstd_s64: + case NEON::BI__builtin_neon_vtstd_u64: { + llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext()); + Ops.push_back(EmitScalarExpr(E->getArg(1))); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]); Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0], - ConstantAggregateZero::get(Ty)); - return Builder.CreateSExt(Ops[0], Ty, "vtst"); + llvm::Constant::getNullValue(Ty)); + return Builder.CreateSExt(Ops[0], Ty, "vtstd"); + } + case NEON::BI__builtin_neon_vset_lane_i8: + case NEON::BI__builtin_neon_vset_lane_i16: + case NEON::BI__builtin_neon_vset_lane_i32: + case NEON::BI__builtin_neon_vset_lane_i64: + case NEON::BI__builtin_neon_vset_lane_f32: + case NEON::BI__builtin_neon_vsetq_lane_i8: + case NEON::BI__builtin_neon_vsetq_lane_i16: + case NEON::BI__builtin_neon_vsetq_lane_i32: + case NEON::BI__builtin_neon_vsetq_lane_i64: + case NEON::BI__builtin_neon_vsetq_lane_f32: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + case NEON::BI__builtin_neon_vset_lane_f64: + // The vector type needs a cast for the v1f64 variant. + Ops[1] = Builder.CreateBitCast(Ops[1], + llvm::VectorType::get(DoubleTy, 1)); + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + case NEON::BI__builtin_neon_vsetq_lane_f64: + // The vector type needs a cast for the v2f64 variant. 
+ Ops[1] = Builder.CreateBitCast(Ops[1], + llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2)); + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + + case NEON::BI__builtin_neon_vget_lane_i8: + case NEON::BI__builtin_neon_vdupb_lane_i8: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vgetq_lane_i8: + case NEON::BI__builtin_neon_vdupb_laneq_i8: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_i16: + case NEON::BI__builtin_neon_vduph_lane_i16: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vgetq_lane_i16: + case NEON::BI__builtin_neon_vduph_laneq_i16: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_i32: + case NEON::BI__builtin_neon_vdups_lane_i32: + Ops[0] = Builder.CreateBitCast( + Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 2)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vdups_lane_f32: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vdups_lane"); + case NEON::BI__builtin_neon_vgetq_lane_i32: + case NEON::BI__builtin_neon_vdups_laneq_i32: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 4)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_i64: + case NEON::BI__builtin_neon_vdupd_lane_i64: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 1)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vdupd_lane_f64: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vdupd_lane"); + case NEON::BI__builtin_neon_vgetq_lane_i64: + case NEON::BI__builtin_neon_vdupd_laneq_i64: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 2)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_f32: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vget_lane_f64: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1)); + 
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); + case NEON::BI__builtin_neon_vgetq_lane_f32: + case NEON::BI__builtin_neon_vdups_laneq_f32: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 4)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vgetq_lane_f64: + case NEON::BI__builtin_neon_vdupd_laneq_f64: + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2)); + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vgetq_lane"); + case NEON::BI__builtin_neon_vaddd_s64: + case NEON::BI__builtin_neon_vaddd_u64: + return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd"); + case NEON::BI__builtin_neon_vsubd_s64: + case NEON::BI__builtin_neon_vsubd_u64: + return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd"); + case NEON::BI__builtin_neon_vqdmlalh_s16: + case NEON::BI__builtin_neon_vqdmlslh_s16: { + SmallVector<Value *, 2> ProductOps; + ProductOps.push_back(vectorWrapScalar16(Ops[1])); + ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2)))); + llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4); + Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy), + ProductOps, "vqdmlXl"); + Constant *CI = ConstantInt::get(SizeTy, 0); + Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0"); + + unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16 + ? Intrinsic::aarch64_neon_sqadd + : Intrinsic::aarch64_neon_sqsub; + return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl"); + } + case NEON::BI__builtin_neon_vqshlud_n_s64: { + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty), + Ops, "vqshlu_n"); + } + case NEON::BI__builtin_neon_vqshld_n_u64: + case NEON::BI__builtin_neon_vqshld_n_s64: { + unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64 + ? Intrinsic::aarch64_neon_uqshl + : Intrinsic::aarch64_neon_sqshl; + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty); + return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n"); + } + case NEON::BI__builtin_neon_vrshrd_n_u64: + case NEON::BI__builtin_neon_vrshrd_n_s64: { + unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64 + ? Intrinsic::aarch64_neon_urshl + : Intrinsic::aarch64_neon_srshl; + Ops.push_back(EmitScalarExpr(E->getArg(1))); + int SV = cast<ConstantInt>(Ops[1])->getSExtValue(); + Ops[1] = ConstantInt::get(Int64Ty, -SV); + return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n"); + } + case NEON::BI__builtin_neon_vrsrad_n_u64: + case NEON::BI__builtin_neon_vrsrad_n_s64: { + unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64 + ? 
Intrinsic::aarch64_neon_urshl + : Intrinsic::aarch64_neon_srshl; + Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty); + Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2)))); + Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Int64Ty), Ops[1], + Builder.CreateSExt(Ops[2], Int64Ty)); + return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty)); + } + case NEON::BI__builtin_neon_vshld_n_s64: + case NEON::BI__builtin_neon_vshld_n_u64: { + llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); + return Builder.CreateShl( + Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n"); + } + case NEON::BI__builtin_neon_vshrd_n_s64: { + llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); + return Builder.CreateAShr( + Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63), + Amt->getZExtValue())), + "shrd_n"); + } + case NEON::BI__builtin_neon_vshrd_n_u64: { + llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1))); + uint64_t ShiftAmt = Amt->getZExtValue(); + // Right-shifting an unsigned value by its size yields 0. + if (ShiftAmt == 64) + return ConstantInt::get(Int64Ty, 0); + return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt), + "shrd_n"); + } + case NEON::BI__builtin_neon_vsrad_n_s64: { + llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2))); + Ops[1] = Builder.CreateAShr( + Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63), + Amt->getZExtValue())), + "shrd_n"); + return Builder.CreateAdd(Ops[0], Ops[1]); + } + case NEON::BI__builtin_neon_vsrad_n_u64: { + llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2))); + uint64_t ShiftAmt = Amt->getZExtValue(); + // Right-shifting an unsigned value by its size yields 0. + // As Op + 0 = Op, return Ops[0] directly. + if (ShiftAmt == 64) + return Ops[0]; + Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt), + "shrd_n"); + return Builder.CreateAdd(Ops[0], Ops[1]); + } + case NEON::BI__builtin_neon_vqdmlalh_lane_s16: + case NEON::BI__builtin_neon_vqdmlalh_laneq_s16: + case NEON::BI__builtin_neon_vqdmlslh_lane_s16: + case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: { + Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)), + "lane"); + SmallVector<Value *, 2> ProductOps; + ProductOps.push_back(vectorWrapScalar16(Ops[1])); + ProductOps.push_back(vectorWrapScalar16(Ops[2])); + llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4); + Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy), + ProductOps, "vqdmlXl"); + Constant *CI = ConstantInt::get(SizeTy, 0); + Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0"); + Ops.pop_back(); + + unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 || + BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16) + ? Intrinsic::aarch64_neon_sqadd + : Intrinsic::aarch64_neon_sqsub; + return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl"); + } + case NEON::BI__builtin_neon_vqdmlals_s32: + case NEON::BI__builtin_neon_vqdmlsls_s32: { + SmallVector<Value *, 2> ProductOps; + ProductOps.push_back(Ops[1]); + ProductOps.push_back(EmitScalarExpr(E->getArg(2))); + Ops[1] = + EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar), + ProductOps, "vqdmlXl"); + + unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32 + ? 
Intrinsic::aarch64_neon_sqadd + : Intrinsic::aarch64_neon_sqsub; + return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl"); + } + case NEON::BI__builtin_neon_vqdmlals_lane_s32: + case NEON::BI__builtin_neon_vqdmlals_laneq_s32: + case NEON::BI__builtin_neon_vqdmlsls_lane_s32: + case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: { + Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)), + "lane"); + SmallVector<Value *, 2> ProductOps; + ProductOps.push_back(Ops[1]); + ProductOps.push_back(Ops[2]); + Ops[1] = + EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar), + ProductOps, "vqdmlXl"); + Ops.pop_back(); + + unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 || + BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32) + ? Intrinsic::aarch64_neon_sqadd + : Intrinsic::aarch64_neon_sqsub; + return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl"); + } + } + + llvm::VectorType *VTy = GetNeonType(this, Type); + llvm::Type *Ty = VTy; + if (!Ty) + return nullptr; + + // Not all intrinsics handled by the common case work for AArch64 yet, so only + // defer to common code if it's been added to our special map. + Builtin = findNeonIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID, + AArch64SIMDIntrinsicsProvenSorted); + + if (Builtin) + return EmitCommonNeonBuiltinExpr( + Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic, + Builtin->NameHint, Builtin->TypeModifier, E, Ops, nullptr); + + if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops)) + return V; + + unsigned Int; + switch (BuiltinID) { + default: return nullptr; + case NEON::BI__builtin_neon_vbsl_v: + case NEON::BI__builtin_neon_vbslq_v: { + llvm::Type *BitTy = llvm::VectorType::getInteger(VTy); + Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl"); + Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl"); + Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl"); + + Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl"); + Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl"); + Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl"); + return Builder.CreateBitCast(Ops[0], Ty); + } + case NEON::BI__builtin_neon_vfma_lane_v: + case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types + // The ARM builtins (and instructions) have the addend as the first + // operand, but the 'fma' intrinsics have it last. Swap it around here. + Value *Addend = Ops[0]; + Value *Multiplicand = Ops[1]; + Value *LaneSource = Ops[2]; + Ops[0] = Multiplicand; + Ops[1] = LaneSource; + Ops[2] = Addend; + + // Now adjust things to handle the lane access. + llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ? 
+ llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) : + VTy; + llvm::Constant *cst = cast<Constant>(Ops[3]); + Value *SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), cst); + Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy); + Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane"); + + Ops.pop_back(); + Int = Intrinsic::fma; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla"); + } + case NEON::BI__builtin_neon_vfma_laneq_v: { + llvm::VectorType *VTy = cast<llvm::VectorType>(Ty); + // v1f64 fma should be mapped to Neon scalar f64 fma + if (VTy && VTy->getElementType() == DoubleTy) { + Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); + Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); + llvm::Type *VTy = GetNeonType(this, + NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + Ops[2] = Builder.CreateBitCast(Ops[2], VTy); + Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); + Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy); + Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); + return Builder.CreateBitCast(Result, Ty); + } + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + + llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(), + VTy->getNumElements() * 2); + Ops[2] = Builder.CreateBitCast(Ops[2], STy); + Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), + cast<ConstantInt>(Ops[3])); + Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane"); + + return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vfmaq_laneq_v: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3])); + return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vfmas_lane_f32: + case NEON::BI__builtin_neon_vfmas_laneq_f32: + case NEON::BI__builtin_neon_vfmad_lane_f64: + case NEON::BI__builtin_neon_vfmad_laneq_f64: { + Ops.push_back(EmitScalarExpr(E->getArg(3))); + llvm::Type *Ty = ConvertType(E->getCallReturnType()); + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); + return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]); + } + case NEON::BI__builtin_neon_vfms_v: + case NEON::BI__builtin_neon_vfmsq_v: { // Only used for FP types + // FIXME: probably remove when we no longer support aarch64_simd.h + // (arm_neon.h delegates to vfma). + + // The ARM builtins (and instructions) have the addend as the first + // operand, but the 'fma' intrinsics have it last. Swap it around here. + Value *Subtrahend = Ops[0]; + Value *Multiplicand = Ops[2]; + Ops[0] = Multiplicand; + Ops[2] = Subtrahend; + Ops[1] = Builder.CreateBitCast(Ops[1], VTy); + Ops[1] = Builder.CreateFNeg(Ops[1]); + Int = Intrinsic::fma; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls"); + } + case NEON::BI__builtin_neon_vmull_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? 
Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull; + if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull"); + case NEON::BI__builtin_neon_vmax_v: + case NEON::BI__builtin_neon_vmaxq_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax; + if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax"); + case NEON::BI__builtin_neon_vmin_v: + case NEON::BI__builtin_neon_vminq_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin; + if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin"); + case NEON::BI__builtin_neon_vabd_v: + case NEON::BI__builtin_neon_vabdq_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd; + if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd"); + case NEON::BI__builtin_neon_vpadal_v: + case NEON::BI__builtin_neon_vpadalq_v: { + unsigned ArgElts = VTy->getNumElements(); + llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType()); + unsigned BitWidth = EltTy->getBitWidth(); + llvm::Type *ArgTy = llvm::VectorType::get( + llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts); + llvm::Type* Tys[2] = { VTy, ArgTy }; + Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp; + SmallVector<llvm::Value*, 1> TmpOps; + TmpOps.push_back(Ops[1]); + Function *F = CGM.getIntrinsic(Int, Tys); + llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal"); + llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType()); + return Builder.CreateAdd(tmp, addend); + } + case NEON::BI__builtin_neon_vpmin_v: + case NEON::BI__builtin_neon_vpminq_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp; + if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin"); + case NEON::BI__builtin_neon_vpmax_v: + case NEON::BI__builtin_neon_vpmaxq_v: + // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. + Int = usgn ? 
Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp; + if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax"); + case NEON::BI__builtin_neon_vminnm_v: + case NEON::BI__builtin_neon_vminnmq_v: + Int = Intrinsic::aarch64_neon_fminnm; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm"); + case NEON::BI__builtin_neon_vmaxnm_v: + case NEON::BI__builtin_neon_vmaxnmq_v: + Int = Intrinsic::aarch64_neon_fmaxnm; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm"); + case NEON::BI__builtin_neon_vrecpss_f32: { + llvm::Type *f32Type = llvm::Type::getFloatTy(getLLVMContext()); + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, f32Type), + Ops, "vrecps"); + } + case NEON::BI__builtin_neon_vrecpsd_f64: { + llvm::Type *f64Type = llvm::Type::getDoubleTy(getLLVMContext()); + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, f64Type), + Ops, "vrecps"); + } + case NEON::BI__builtin_neon_vrshr_n_v: + case NEON::BI__builtin_neon_vrshrq_n_v: + // FIXME: this can be shared with 32-bit ARM, but not AArch64 at the + // moment. After the final merge it should be added to + // EmitCommonNeonBuiltinExpr. + Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true); + case NEON::BI__builtin_neon_vqshlu_n_v: + case NEON::BI__builtin_neon_vqshluq_n_v: + // FIXME: AArch64 and ARM use different intrinsics for this, but are + // essentially compatible. It should be in EmitCommonNeonBuiltinExpr after + // the final merge. + Int = Intrinsic::aarch64_neon_sqshlu; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n", 1, false); + case NEON::BI__builtin_neon_vqshrun_n_v: + // FIXME: as above + Int = Intrinsic::aarch64_neon_sqshrun; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n"); + case NEON::BI__builtin_neon_vqrshrun_n_v: + // FIXME: and again. + Int = Intrinsic::aarch64_neon_sqrshrun; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n"); + case NEON::BI__builtin_neon_vqshrn_n_v: + // FIXME: guess + Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n"); + case NEON::BI__builtin_neon_vrshrn_n_v: + // FIXME: there might be a pattern here. + Int = Intrinsic::aarch64_neon_rshrn; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n"); + case NEON::BI__builtin_neon_vqrshrn_n_v: + // FIXME: another one + Int = usgn ? 
Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n"); + case NEON::BI__builtin_neon_vrnda_v: + case NEON::BI__builtin_neon_vrndaq_v: { + Int = Intrinsic::round; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda"); + } + case NEON::BI__builtin_neon_vrndi_v: + case NEON::BI__builtin_neon_vrndiq_v: { + Int = Intrinsic::nearbyint; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi"); + } + case NEON::BI__builtin_neon_vrndm_v: + case NEON::BI__builtin_neon_vrndmq_v: { + Int = Intrinsic::floor; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm"); + } + case NEON::BI__builtin_neon_vrndn_v: + case NEON::BI__builtin_neon_vrndnq_v: { + Int = Intrinsic::aarch64_neon_frintn; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn"); + } + case NEON::BI__builtin_neon_vrndp_v: + case NEON::BI__builtin_neon_vrndpq_v: { + Int = Intrinsic::ceil; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp"); + } + case NEON::BI__builtin_neon_vrndx_v: + case NEON::BI__builtin_neon_vrndxq_v: { + Int = Intrinsic::rint; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx"); + } + case NEON::BI__builtin_neon_vrnd_v: + case NEON::BI__builtin_neon_vrndq_v: { + Int = Intrinsic::trunc; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz"); + } + case NEON::BI__builtin_neon_vceqz_v: + case NEON::BI__builtin_neon_vceqzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ, + ICmpInst::ICMP_EQ, "vceqz"); + case NEON::BI__builtin_neon_vcgez_v: + case NEON::BI__builtin_neon_vcgezq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE, + ICmpInst::ICMP_SGE, "vcgez"); + case NEON::BI__builtin_neon_vclez_v: + case NEON::BI__builtin_neon_vclezq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE, + ICmpInst::ICMP_SLE, "vclez"); + case NEON::BI__builtin_neon_vcgtz_v: + case NEON::BI__builtin_neon_vcgtzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT, + ICmpInst::ICMP_SGT, "vcgtz"); + case NEON::BI__builtin_neon_vcltz_v: + case NEON::BI__builtin_neon_vcltzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT, + ICmpInst::ICMP_SLT, "vcltz"); + case NEON::BI__builtin_neon_vcvt_f64_v: + case NEON::BI__builtin_neon_vcvtq_f64_v: + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); + return usgn ? 
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") + : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); + case NEON::BI__builtin_neon_vcvt_f64_f32: { + assert(Type.getEltType() == NeonTypeFlags::Float64 && quad && + "unexpected vcvt_f64_f32 builtin"); + NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false); + Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag)); + + return Builder.CreateFPExt(Ops[0], Ty, "vcvt"); + } + case NEON::BI__builtin_neon_vcvt_f32_f64: { + assert(Type.getEltType() == NeonTypeFlags::Float32 && + "unexpected vcvt_f32_f64 builtin"); + NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true); + Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag)); + + return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt"); + } + case NEON::BI__builtin_neon_vcvt_s32_v: + case NEON::BI__builtin_neon_vcvt_u32_v: + case NEON::BI__builtin_neon_vcvt_s64_v: + case NEON::BI__builtin_neon_vcvt_u64_v: + case NEON::BI__builtin_neon_vcvtq_s32_v: + case NEON::BI__builtin_neon_vcvtq_u32_v: + case NEON::BI__builtin_neon_vcvtq_s64_v: + case NEON::BI__builtin_neon_vcvtq_u64_v: { + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, quad)); + Ops[0] = Builder.CreateBitCast(Ops[0], InTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Ty); + return Builder.CreateFPToSI(Ops[0], Ty); + } + case NEON::BI__builtin_neon_vcvta_s32_v: + case NEON::BI__builtin_neon_vcvtaq_s32_v: + case NEON::BI__builtin_neon_vcvta_u32_v: + case NEON::BI__builtin_neon_vcvtaq_u32_v: + case NEON::BI__builtin_neon_vcvta_s64_v: + case NEON::BI__builtin_neon_vcvtaq_s64_v: + case NEON::BI__builtin_neon_vcvta_u64_v: + case NEON::BI__builtin_neon_vcvtaq_u64_v: { + Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas; + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, quad)); + llvm::Type *Tys[2] = { Ty, InTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta"); + } + case NEON::BI__builtin_neon_vcvtm_s32_v: + case NEON::BI__builtin_neon_vcvtmq_s32_v: + case NEON::BI__builtin_neon_vcvtm_u32_v: + case NEON::BI__builtin_neon_vcvtmq_u32_v: + case NEON::BI__builtin_neon_vcvtm_s64_v: + case NEON::BI__builtin_neon_vcvtmq_s64_v: + case NEON::BI__builtin_neon_vcvtm_u64_v: + case NEON::BI__builtin_neon_vcvtmq_u64_v: { + Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms; + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, quad)); + llvm::Type *Tys[2] = { Ty, InTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm"); + } + case NEON::BI__builtin_neon_vcvtn_s32_v: + case NEON::BI__builtin_neon_vcvtnq_s32_v: + case NEON::BI__builtin_neon_vcvtn_u32_v: + case NEON::BI__builtin_neon_vcvtnq_u32_v: + case NEON::BI__builtin_neon_vcvtn_s64_v: + case NEON::BI__builtin_neon_vcvtnq_s64_v: + case NEON::BI__builtin_neon_vcvtn_u64_v: + case NEON::BI__builtin_neon_vcvtnq_u64_v: { + Int = usgn ? 
Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns; + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, quad)); + llvm::Type *Tys[2] = { Ty, InTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn"); + } + case NEON::BI__builtin_neon_vcvtp_s32_v: + case NEON::BI__builtin_neon_vcvtpq_s32_v: + case NEON::BI__builtin_neon_vcvtp_u32_v: + case NEON::BI__builtin_neon_vcvtpq_u32_v: + case NEON::BI__builtin_neon_vcvtp_s64_v: + case NEON::BI__builtin_neon_vcvtpq_s64_v: + case NEON::BI__builtin_neon_vcvtp_u64_v: + case NEON::BI__builtin_neon_vcvtpq_u64_v: { + Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps; + bool Double = + (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64); + llvm::Type *InTy = + GetNeonType(this, + NeonTypeFlags(Double ? NeonTypeFlags::Float64 + : NeonTypeFlags::Float32, false, quad)); + llvm::Type *Tys[2] = { Ty, InTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp"); + } + case NEON::BI__builtin_neon_vmulx_v: + case NEON::BI__builtin_neon_vmulxq_v: { + Int = Intrinsic::aarch64_neon_fmulx; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); + } + case NEON::BI__builtin_neon_vmul_lane_v: + case NEON::BI__builtin_neon_vmul_laneq_v: { + // v1f64 vmul_lane should be mapped to Neon scalar mul lane + bool Quad = false; + if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v) + Quad = true; + Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); + llvm::Type *VTy = GetNeonType(this, + NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); + Ops[1] = Builder.CreateBitCast(Ops[1], VTy); + Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); + Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); + return Builder.CreateBitCast(Result, Ty); + } + case NEON::BI__builtin_neon_vnegd_s64: + return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd"); + case NEON::BI__builtin_neon_vpmaxnm_v: + case NEON::BI__builtin_neon_vpmaxnmq_v: { + Int = Intrinsic::aarch64_neon_fmaxnmp; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm"); + } + case NEON::BI__builtin_neon_vpminnm_v: + case NEON::BI__builtin_neon_vpminnmq_v: { + Int = Intrinsic::aarch64_neon_fminnmp; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm"); + } + case NEON::BI__builtin_neon_vsqrt_v: + case NEON::BI__builtin_neon_vsqrtq_v: { + Int = Intrinsic::sqrt; + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt"); + } + case NEON::BI__builtin_neon_vrbit_v: + case NEON::BI__builtin_neon_vrbitq_v: { + Int = Intrinsic::aarch64_neon_rbit; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit"); + } + case NEON::BI__builtin_neon_vaddv_u8: + // FIXME: These are handled by the AArch64 scalar code. + usgn = true; + // FALLTHROUGH + case NEON::BI__builtin_neon_vaddv_s8: { + Int = usgn ? 
Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vaddv_u16: + usgn = true; + // FALLTHROUGH + case NEON::BI__builtin_neon_vaddv_s16: { + Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vaddvq_u8: + usgn = true; + // FALLTHROUGH + case NEON::BI__builtin_neon_vaddvq_s8: { + Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vaddvq_u16: + usgn = true; + // FALLTHROUGH + case NEON::BI__builtin_neon_vaddvq_s16: { + Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vmaxv_u8: { + Int = Intrinsic::aarch64_neon_umaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vmaxv_u16: { + Int = Intrinsic::aarch64_neon_umaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vmaxvq_u8: { + Int = Intrinsic::aarch64_neon_umaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vmaxvq_u16: { 
+ Int = Intrinsic::aarch64_neon_umaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vmaxv_s8: { + Int = Intrinsic::aarch64_neon_smaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vmaxv_s16: { + Int = Intrinsic::aarch64_neon_smaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vmaxvq_s8: { + Int = Intrinsic::aarch64_neon_smaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vmaxvq_s16: { + Int = Intrinsic::aarch64_neon_smaxv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vminv_u8: { + Int = Intrinsic::aarch64_neon_uminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vminv_u16: { + Int = Intrinsic::aarch64_neon_uminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vminvq_u8: { + Int = Intrinsic::aarch64_neon_uminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], 
+ llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vminvq_u16: { + Int = Intrinsic::aarch64_neon_uminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vminv_s8: { + Int = Intrinsic::aarch64_neon_sminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vminv_s16: { + Int = Intrinsic::aarch64_neon_sminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vminvq_s8: { + Int = Intrinsic::aarch64_neon_sminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 8)); + } + case NEON::BI__builtin_neon_vminvq_s16: { + Int = Intrinsic::aarch64_neon_sminv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vmul_n_f64: { + Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); + Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy); + return Builder.CreateFMul(Ops[0], RHS); + } + case NEON::BI__builtin_neon_vaddlv_u8: { + Int = Intrinsic::aarch64_neon_uaddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vaddlv_u16: { + Int = Intrinsic::aarch64_neon_uaddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + } + case NEON::BI__builtin_neon_vaddlvq_u8: { + Int = Intrinsic::aarch64_neon_uaddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + 
llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vaddlvq_u16: { + Int = Intrinsic::aarch64_neon_uaddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + } + case NEON::BI__builtin_neon_vaddlv_s8: { + Int = Intrinsic::aarch64_neon_saddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vaddlv_s16: { + Int = Intrinsic::aarch64_neon_saddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + } + case NEON::BI__builtin_neon_vaddlvq_s8: { + Int = Intrinsic::aarch64_neon_saddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + return Builder.CreateTrunc(Ops[0], + llvm::IntegerType::get(getLLVMContext(), 16)); + } + case NEON::BI__builtin_neon_vaddlvq_s16: { + Int = Intrinsic::aarch64_neon_saddlv; + Ty = llvm::IntegerType::get(getLLVMContext(), 32); + VTy = + llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8); + llvm::Type *Tys[2] = { Ty, VTy }; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv"); + } + case NEON::BI__builtin_neon_vsri_n_v: + case NEON::BI__builtin_neon_vsriq_n_v: { + Int = Intrinsic::aarch64_neon_vsri; + llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty); + return EmitNeonCall(Intrin, Ops, "vsri_n"); + } + case NEON::BI__builtin_neon_vsli_n_v: + case NEON::BI__builtin_neon_vsliq_n_v: { + Int = Intrinsic::aarch64_neon_vsli; + llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty); + return EmitNeonCall(Intrin, Ops, "vsli_n"); + } + case NEON::BI__builtin_neon_vsra_n_v: + case NEON::BI__builtin_neon_vsraq_n_v: + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n"); + return Builder.CreateAdd(Ops[0], Ops[1]); + case NEON::BI__builtin_neon_vrsra_n_v: + case NEON::BI__builtin_neon_vrsraq_n_v: { + Int = usgn ? 
Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl; + SmallVector<llvm::Value*,2> TmpOps; + TmpOps.push_back(Ops[1]); + TmpOps.push_back(Ops[2]); + Function* F = CGM.getIntrinsic(Int, Ty); + llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true); + Ops[0] = Builder.CreateBitCast(Ops[0], VTy); + return Builder.CreateAdd(Ops[0], tmp); + } + // FIXME: Sharing loads & stores with 32-bit is complicated by the absence + // of an Align parameter here. + case NEON::BI__builtin_neon_vld1_x2_v: + case NEON::BI__builtin_neon_vld1q_x2_v: + case NEON::BI__builtin_neon_vld1_x3_v: + case NEON::BI__builtin_neon_vld1q_x3_v: + case NEON::BI__builtin_neon_vld1_x4_v: + case NEON::BI__builtin_neon_vld1q_x4_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + unsigned Int; + switch (BuiltinID) { + case NEON::BI__builtin_neon_vld1_x2_v: + case NEON::BI__builtin_neon_vld1q_x2_v: + Int = Intrinsic::aarch64_neon_ld1x2; + break; + case NEON::BI__builtin_neon_vld1_x3_v: + case NEON::BI__builtin_neon_vld1q_x3_v: + Int = Intrinsic::aarch64_neon_ld1x3; + break; + case NEON::BI__builtin_neon_vld1_x4_v: + case NEON::BI__builtin_neon_vld1q_x4_v: + Int = Intrinsic::aarch64_neon_ld1x4; + break; + } + Function *F = CGM.getIntrinsic(Int, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN"); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vst1_x2_v: + case NEON::BI__builtin_neon_vst1q_x2_v: + case NEON::BI__builtin_neon_vst1_x3_v: + case NEON::BI__builtin_neon_vst1q_x3_v: + case NEON::BI__builtin_neon_vst1_x4_v: + case NEON::BI__builtin_neon_vst1q_x4_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); + llvm::Type *Tys[2] = { VTy, PTy }; + unsigned Int; + switch (BuiltinID) { + case NEON::BI__builtin_neon_vst1_x2_v: + case NEON::BI__builtin_neon_vst1q_x2_v: + Int = Intrinsic::aarch64_neon_st1x2; + break; + case NEON::BI__builtin_neon_vst1_x3_v: + case NEON::BI__builtin_neon_vst1q_x3_v: + Int = Intrinsic::aarch64_neon_st1x3; + break; + case NEON::BI__builtin_neon_vst1_x4_v: + case NEON::BI__builtin_neon_vst1q_x4_v: + Int = Intrinsic::aarch64_neon_st1x4; + break; + } + SmallVector<Value *, 4> IntOps(Ops.begin()+1, Ops.end()); + IntOps.push_back(Ops[0]); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), IntOps, ""); + } + case NEON::BI__builtin_neon_vld1_v: + case NEON::BI__builtin_neon_vld1q_v: + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy)); + return Builder.CreateLoad(Ops[0]); + case NEON::BI__builtin_neon_vst1_v: + case NEON::BI__builtin_neon_vst1q_v: + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy)); + Ops[1] = Builder.CreateBitCast(Ops[1], VTy); + return Builder.CreateStore(Ops[1], Ops[0]); + case NEON::BI__builtin_neon_vld1_lane_v: + case NEON::BI__builtin_neon_vld1q_lane_v: + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ty = llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[0] = Builder.CreateLoad(Ops[0]); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane"); + case NEON::BI__builtin_neon_vld1_dup_v: + case NEON::BI__builtin_neon_vld1q_dup_v: { + Value *V = UndefValue::get(Ty); + Ty = llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[0] = 
Builder.CreateLoad(Ops[0]); + llvm::Constant *CI = ConstantInt::get(Int32Ty, 0); + Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI); + return EmitNeonSplat(Ops[0], CI); + } + case NEON::BI__builtin_neon_vst1_lane_v: + case NEON::BI__builtin_neon_vst1q_lane_v: + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + return Builder.CreateStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty)); + case NEON::BI__builtin_neon_vld2_v: + case NEON::BI__builtin_neon_vld2q_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld2"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld3_v: + case NEON::BI__builtin_neon_vld3q_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld3"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld4_v: + case NEON::BI__builtin_neon_vld4q_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld4"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); } - case ARM::BI__builtin_neon_vtrn_v: - case ARM::BI__builtin_neon_vtrnq_v: { + case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2q_dup_v: { + llvm::Type *PTy = + llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld2"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3q_dup_v: { + llvm::Type *PTy = + llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld3"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld4q_dup_v: { + llvm::Type *PTy = + llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld4"); + Ops[0] = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case 
NEON::BI__builtin_neon_vld2_lane_v: + case NEON::BI__builtin_neon_vld2q_lane_v: { + llvm::Type *Tys[2] = { VTy, Ops[1]->getType() }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys); + Ops.push_back(Ops[1]); + Ops.erase(Ops.begin()+1); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Ops[3] = Builder.CreateZExt(Ops[3], + llvm::IntegerType::get(getLLVMContext(), 64)); + Ops[1] = Builder.CreateCall(F, + ArrayRef<Value*>(Ops).slice(1), "vld2_lane"); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld3_lane_v: + case NEON::BI__builtin_neon_vld3q_lane_v: { + llvm::Type *Tys[2] = { VTy, Ops[1]->getType() }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys); + Ops.push_back(Ops[1]); + Ops.erase(Ops.begin()+1); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Ops[3] = Builder.CreateBitCast(Ops[3], Ty); + Ops[4] = Builder.CreateZExt(Ops[4], + llvm::IntegerType::get(getLLVMContext(), 64)); + Ops[1] = Builder.CreateCall(F, + ArrayRef<Value*>(Ops).slice(1), "vld3_lane"); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vld4_lane_v: + case NEON::BI__builtin_neon_vld4q_lane_v: { + llvm::Type *Tys[2] = { VTy, Ops[1]->getType() }; + Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys); + Ops.push_back(Ops[1]); + Ops.erase(Ops.begin()+1); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Ops[3] = Builder.CreateBitCast(Ops[3], Ty); + Ops[4] = Builder.CreateBitCast(Ops[4], Ty); + Ops[5] = Builder.CreateZExt(Ops[5], + llvm::IntegerType::get(getLLVMContext(), 64)); + Ops[1] = Builder.CreateCall(F, + ArrayRef<Value*>(Ops).slice(1), "vld4_lane"); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateStore(Ops[1], Ops[0]); + } + case NEON::BI__builtin_neon_vst2_v: + case NEON::BI__builtin_neon_vst2q_v: { + Ops.push_back(Ops[0]); + Ops.erase(Ops.begin()); + llvm::Type *Tys[2] = { VTy, Ops[2]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vst2_lane_v: + case NEON::BI__builtin_neon_vst2q_lane_v: { + Ops.push_back(Ops[0]); + Ops.erase(Ops.begin()); + Ops[2] = Builder.CreateZExt(Ops[2], + llvm::IntegerType::get(getLLVMContext(), 64)); + llvm::Type *Tys[2] = { VTy, Ops[3]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vst3_v: + case NEON::BI__builtin_neon_vst3q_v: { + Ops.push_back(Ops[0]); + Ops.erase(Ops.begin()); + llvm::Type *Tys[2] = { VTy, Ops[3]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vst3_lane_v: + case NEON::BI__builtin_neon_vst3q_lane_v: { + Ops.push_back(Ops[0]); + Ops.erase(Ops.begin()); + Ops[3] = Builder.CreateZExt(Ops[3], + llvm::IntegerType::get(getLLVMContext(), 64)); + llvm::Type *Tys[2] = { VTy, Ops[4]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vst4_v: + case NEON::BI__builtin_neon_vst4q_v: { + Ops.push_back(Ops[0]); + 
Ops.erase(Ops.begin()); + llvm::Type *Tys[2] = { VTy, Ops[4]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vst4_lane_v: + case NEON::BI__builtin_neon_vst4q_lane_v: { + Ops.push_back(Ops[0]); + Ops.erase(Ops.begin()); + Ops[4] = Builder.CreateZExt(Ops[4], + llvm::IntegerType::get(getLLVMContext(), 64)); + llvm::Type *Tys[2] = { VTy, Ops[5]->getType() }; + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys), + Ops, ""); + } + case NEON::BI__builtin_neon_vtrn_v: + case NEON::BI__builtin_neon_vtrnq_v: { Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Value *SV = 0; + Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { SmallVector<Constant*, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { - Indices.push_back(Builder.getInt32(i+vi)); - Indices.push_back(Builder.getInt32(i+e+vi)); + Indices.push_back(ConstantInt::get(Int32Ty, i+vi)); + Indices.push_back(ConstantInt::get(Int32Ty, i+e+vi)); } Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi); SV = llvm::ConstantVector::get(Indices); @@ -4875,12 +5550,12 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } return SV; } - case ARM::BI__builtin_neon_vuzp_v: - case ARM::BI__builtin_neon_vuzpq_v: { + case NEON::BI__builtin_neon_vuzp_v: + case NEON::BI__builtin_neon_vuzpq_v: { Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Value *SV = 0; + Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { SmallVector<Constant*, 16> Indices; @@ -4894,12 +5569,12 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } return SV; } - case ARM::BI__builtin_neon_vzip_v: - case ARM::BI__builtin_neon_vzipq_v: { + case NEON::BI__builtin_neon_vzip_v: + case NEON::BI__builtin_neon_vzipq_v: { Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty)); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ops[2] = Builder.CreateBitCast(Ops[2], Ty); - Value *SV = 0; + Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { SmallVector<Constant*, 16> Indices; @@ -4914,6 +5589,48 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, } return SV; } + case NEON::BI__builtin_neon_vqtbl1q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty), + Ops, "vtbl1"); + } + case NEON::BI__builtin_neon_vqtbl2q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty), + Ops, "vtbl2"); + } + case NEON::BI__builtin_neon_vqtbl3q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty), + Ops, "vtbl3"); + } + case NEON::BI__builtin_neon_vqtbl4q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty), + Ops, "vtbl4"); + } + case NEON::BI__builtin_neon_vqtbx1q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty), + Ops, "vtbx1"); + } + case NEON::BI__builtin_neon_vqtbx2q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty), + Ops, "vtbx2"); + } + case NEON::BI__builtin_neon_vqtbx3q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty), + Ops, "vtbx3"); + } + case NEON::BI__builtin_neon_vqtbx4q_v: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty), + Ops, "vtbx4"); + } + case 
NEON::BI__builtin_neon_vsqadd_v: + case NEON::BI__builtin_neon_vsqaddq_v: { + Int = Intrinsic::aarch64_neon_usqadd; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd"); + } + case NEON::BI__builtin_neon_vuqadd_v: + case NEON::BI__builtin_neon_vuqaddq_v: { + Int = Intrinsic::aarch64_neon_suqadd; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); + } } } @@ -4969,7 +5686,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } switch (BuiltinID) { - default: return 0; + default: return nullptr; + case X86::BI_mm_prefetch: { + Value *Address = EmitScalarExpr(E->getArg(0)); + Value *RW = ConstantInt::get(Int32Ty, 0); + Value *Locality = EmitScalarExpr(E->getArg(1)); + Value *Data = ConstantInt::get(Int32Ty, 1); + Value *F = CGM.getIntrinsic(Intrinsic::prefetch); + return Builder.CreateCall4(F, Address, RW, Locality, Data); + } case X86::BI__builtin_ia32_vec_init_v8qi: case X86::BI__builtin_ia32_vec_init_v4hi: case X86::BI__builtin_ia32_vec_init_v2si: @@ -5000,7 +5725,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // extract (0, 1) unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 0 : 1; - llvm::Value *Idx = llvm::ConstantInt::get(Int32Ty, Index); + llvm::Value *Idx = llvm::ConstantInt::get(SizeTy, Index); Ops[1] = Builder.CreateExtractElement(Ops[1], Idx, "extract"); // cast pointer to i64 & store @@ -5138,7 +5863,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // 3DNow! case X86::BI__builtin_ia32_pswapdsf: case X86::BI__builtin_ia32_pswapdsi: { - const char *name = 0; + const char *name = nullptr; Intrinsic::ID ID = Intrinsic::not_intrinsic; switch(BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); @@ -5207,7 +5932,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID = Intrinsic::not_intrinsic; switch (BuiltinID) { - default: return 0; + default: return nullptr; // vec_ld, vec_lvsl, vec_lvsr case PPC::BI__builtin_altivec_lvx: @@ -5285,3 +6010,83 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, } } } + +// Emit an intrinsic that has 1 float or double. +static Value *emitUnaryFPBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall(F, Src0); +} + +// Emit an intrinsic that has 3 float or double operands. +static Value *emitTernaryFPBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall3(F, Src0, Src1, Src2); +} + +Value *CodeGenFunction::EmitR600BuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + switch (BuiltinID) { + case R600::BI__builtin_amdgpu_div_scale: + case R600::BI__builtin_amdgpu_div_scalef: { + // Translate from the intrinsics's struct return to the builtin's out + // argument. 
+ + std::pair<llvm::Value *, unsigned> FlagOutPtr + = EmitPointerWithAlignment(E->getArg(3)); + + llvm::Value *X = EmitScalarExpr(E->getArg(0)); + llvm::Value *Y = EmitScalarExpr(E->getArg(1)); + llvm::Value *Z = EmitScalarExpr(E->getArg(2)); + + llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::AMDGPU_div_scale, + X->getType()); + + llvm::Value *Tmp = Builder.CreateCall3(Callee, X, Y, Z); + + llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0); + llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1); + + llvm::Type *RealFlagType + = FlagOutPtr.first->getType()->getPointerElementType(); + + llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType); + llvm::StoreInst *FlagStore = Builder.CreateStore(FlagExt, FlagOutPtr.first); + FlagStore->setAlignment(FlagOutPtr.second); + return Result; + } + case R600::BI__builtin_amdgpu_div_fmas: + case R600::BI__builtin_amdgpu_div_fmasf: + return emitTernaryFPBuiltin(*this, E, Intrinsic::AMDGPU_div_fmas); + case R600::BI__builtin_amdgpu_div_fixup: + case R600::BI__builtin_amdgpu_div_fixupf: + return emitTernaryFPBuiltin(*this, E, Intrinsic::AMDGPU_div_fixup); + case R600::BI__builtin_amdgpu_trig_preop: + case R600::BI__builtin_amdgpu_trig_preopf: { + Value *Src0 = EmitScalarExpr(E->getArg(0)); + Value *Src1 = EmitScalarExpr(E->getArg(1)); + Value *F = CGM.getIntrinsic(Intrinsic::AMDGPU_trig_preop, Src0->getType()); + return Builder.CreateCall2(F, Src0, Src1); + } + case R600::BI__builtin_amdgpu_rcp: + case R600::BI__builtin_amdgpu_rcpf: + return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rcp); + case R600::BI__builtin_amdgpu_rsq: + case R600::BI__builtin_amdgpu_rsqf: + return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq); + case R600::BI__builtin_amdgpu_rsq_clamped: + case R600::BI__builtin_amdgpu_rsq_clampedf: + return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq_clamped); + default: + return nullptr; + } +}
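
Editor's note (not part of the patch): as a minimal caller-side sketch of the div_scale lowering added above, the builtin returns the first element of the intrinsic's struct result, while the i1 flag (second element) is zero-extended and stored through the final pointer argument (E->getArg(3)). The function name, the bool selector, and the bool out-flag type below are illustrative assumptions only and are not taken from this diff.

    // Hypothetical usage sketch; assumes a bool selector and bool out-flag.
    double div_scale_demo(double x, double y, bool sel) {
      bool flag = false;  // written through the pointer argument (getArg(3))
      double scaled = __builtin_amdgpu_div_scale(x, y, sel, &flag);
      return flag ? scaled : x;  // both the value and the flag are usable
    }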