Diffstat (limited to 'contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp | 1666
1 file changed, 1236 insertions, 430 deletions
diff --git a/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp b/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp index 787ac53..a5fc531 100644 --- a/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp +++ b/contrib/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include <sstream> using namespace clang; @@ -105,9 +106,8 @@ static Value *MakeBinaryAtomicValue(CodeGenFunction &CGF, llvm::Type *ValueType = Args[1]->getType(); Args[1] = EmitToInt(CGF, Args[1], T, IntType); - llvm::Value *Result = - CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1], - llvm::SequentiallyConsistent); + llvm::Value *Result = CGF.Builder.CreateAtomicRMW( + Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent); return EmitFromInt(CGF, Result, T, ValueType); } @@ -167,9 +167,8 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, Args[1] = EmitToInt(CGF, Args[1], T, IntType); Args[0] = CGF.Builder.CreateBitCast(DestPtr, IntPtrType); - llvm::Value *Result = - CGF.Builder.CreateAtomicRMW(Kind, Args[0], Args[1], - llvm::SequentiallyConsistent); + llvm::Value *Result = CGF.Builder.CreateAtomicRMW( + Kind, Args[0], Args[1], llvm::AtomicOrdering::SequentiallyConsistent); Result = CGF.Builder.CreateBinOp(Op, Result, Args[1]); if (Invert) Result = CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result, @@ -206,9 +205,9 @@ static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E, Args[1] = EmitToInt(CGF, Args[1], T, IntType); Args[2] = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType); - Value *Pair = CGF.Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2], - llvm::SequentiallyConsistent, - llvm::SequentiallyConsistent); + Value *Pair = CGF.Builder.CreateAtomicCmpXchg( + Args[0], Args[1], Args[2], llvm::AtomicOrdering::SequentiallyConsistent, + llvm::AtomicOrdering::SequentiallyConsistent); if (ReturnBool) // Extract boolean success flag and zext it to int. return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1), @@ -219,6 +218,51 @@ static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E, ValueType); } +// Emit a simple mangled intrinsic that has 1 argument and a return type +// matching the argument type. +static Value *emitUnaryBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall(F, Src0); +} + +// Emit an intrinsic that has 2 operands of the same type as its result. +static Value *emitBinaryBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall(F, { Src0, Src1 }); +} + +// Emit an intrinsic that has 3 operands of the same type as its result. 
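// (Aside on the llvm::SequentiallyConsistent to
// llvm::AtomicOrdering::SequentiallyConsistent renames running through this
// diff: LLVM changed AtomicOrdering from an unscoped enum into a scoped one,
// so the enumerators no longer live directly in namespace llvm and no longer
// convert implicitly to integers. A minimal sketch of the distinction,
// illustrative only and not LLVM's actual declaration:)
enum class AtomicOrderingSketch {
  NotAtomic, Unordered, Monotonic, Acquire, Release,
  AcquireRelease, SequentiallyConsistent
};
// Before: Builder.CreateFence(llvm::SequentiallyConsistent);
// After:  Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
// (Continuing with the three-operand emitter announced above:)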
+static Value *emitTernaryBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 }); +} + +// Emit an intrinsic that has 1 float or double operand, and 1 integer. +static Value *emitFPIntBuiltin(CodeGenFunction &CGF, + const CallExpr *E, + unsigned IntrinsicID) { + llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); + + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); + return CGF.Builder.CreateCall(F, {Src0, Src1}); +} + /// EmitFAbs - Emit a call to @llvm.fabs(). static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) { Value *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType()); @@ -248,8 +292,8 @@ static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) { if (CGF.getTarget().isBigEndian()) { Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width); V = CGF.Builder.CreateLShr(V, ShiftCst); - } - // We are truncating value in order to extract the higher-order + } + // We are truncating value in order to extract the higher-order // double, which we will be using to extract the sign from. IntTy = llvm::IntegerType::get(C, Width); V = CGF.Builder.CreateTrunc(V, IntTy); @@ -288,6 +332,17 @@ static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF, return CGF.Builder.CreateExtractValue(Tmp, 0); } +static Value *emitRangedBuiltin(CodeGenFunction &CGF, + unsigned IntrinsicID, + int low, int high) { + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); + llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high)); + Value *F = CGF.CGM.getIntrinsic(IntrinsicID, {}); + llvm::Instruction *Call = CGF.Builder.CreateCall(F); + Call->setMetadata(llvm::LLVMContext::MD_range, RNode); + return Call; +} + namespace { struct WidthAndSignedness { unsigned Width; @@ -465,9 +520,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_fabs: case Builtin::BI__builtin_fabsf: case Builtin::BI__builtin_fabsl: { - Value *Arg1 = EmitScalarExpr(E->getArg(0)); - Value *Result = EmitFAbs(*this, Arg1); - return RValue::get(Result); + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs)); } case Builtin::BI__builtin_fmod: case Builtin::BI__builtin_fmodf: @@ -477,7 +530,51 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, Value *Result = Builder.CreateFRem(Arg1, Arg2, "fmod"); return RValue::get(Result); } - + case Builtin::BI__builtin_copysign: + case Builtin::BI__builtin_copysignf: + case Builtin::BI__builtin_copysignl: { + return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign)); + } + case Builtin::BI__builtin_ceil: + case Builtin::BI__builtin_ceilf: + case Builtin::BI__builtin_ceill: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::ceil)); + } + case Builtin::BI__builtin_floor: + case Builtin::BI__builtin_floorf: + case Builtin::BI__builtin_floorl: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::floor)); + } + case Builtin::BI__builtin_trunc: + case Builtin::BI__builtin_truncf: + case Builtin::BI__builtin_truncl: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::trunc)); + } + case Builtin::BI__builtin_rint: + case Builtin::BI__builtin_rintf: + case Builtin::BI__builtin_rintl: { + return 
RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::rint)); + } + case Builtin::BI__builtin_nearbyint: + case Builtin::BI__builtin_nearbyintf: + case Builtin::BI__builtin_nearbyintl: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::nearbyint)); + } + case Builtin::BI__builtin_round: + case Builtin::BI__builtin_roundf: + case Builtin::BI__builtin_roundl: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::round)); + } + case Builtin::BI__builtin_fmin: + case Builtin::BI__builtin_fminf: + case Builtin::BI__builtin_fminl: { + return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::minnum)); + } + case Builtin::BI__builtin_fmax: + case Builtin::BI__builtin_fmaxf: + case Builtin::BI__builtin_fmaxl: { + return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::maxnum)); + } case Builtin::BI__builtin_conj: case Builtin::BI__builtin_conjf: case Builtin::BI__builtin_conjl: { @@ -645,10 +742,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_bswap16: case Builtin::BI__builtin_bswap32: case Builtin::BI__builtin_bswap64: { - Value *ArgValue = EmitScalarExpr(E->getArg(0)); - llvm::Type *ArgType = ArgValue->getType(); - Value *F = CGM.getIntrinsic(Intrinsic::bswap, ArgType); - return RValue::get(Builder.CreateCall(F, ArgValue)); + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap)); + } + case Builtin::BI__builtin_bitreverse8: + case Builtin::BI__builtin_bitreverse16: + case Builtin::BI__builtin_bitreverse32: + case Builtin::BI__builtin_bitreverse64: { + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse)); } case Builtin::BI__builtin_object_size: { unsigned Type = @@ -751,13 +851,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType()))); } - case Builtin::BI__builtin_isinf: { - // isinf(x) --> fabs(x) == infinity + case Builtin::BI__builtin_isinf: + case Builtin::BI__builtin_isfinite: { + // isinf(x) --> fabs(x) == infinity + // isfinite(x) --> fabs(x) != infinity + // x != NaN via the ordered compare in either case. Value *V = EmitScalarExpr(E->getArg(0)); - V = EmitFAbs(*this, V); - - V = Builder.CreateFCmpOEQ(V, ConstantFP::getInfinity(V->getType()),"isinf"); - return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType()))); + Value *Fabs = EmitFAbs(*this, V); + Constant *Infinity = ConstantFP::getInfinity(V->getType()); + CmpInst::Predicate Pred = (BuiltinID == Builtin::BI__builtin_isinf) + ? 
CmpInst::FCMP_OEQ + : CmpInst::FCMP_ONE; + Value *FCmp = Builder.CreateFCmp(Pred, Fabs, Infinity, "cmpinf"); + return RValue::get(Builder.CreateZExt(FCmp, ConvertType(E->getType()))); } case Builtin::BI__builtin_isinf_sign: { @@ -795,19 +901,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType()))); } - case Builtin::BI__builtin_isfinite: { - // isfinite(x) --> x == x && fabs(x) != infinity; - Value *V = EmitScalarExpr(E->getArg(0)); - Value *Eq = Builder.CreateFCmpOEQ(V, V, "iseq"); - - Value *Abs = EmitFAbs(*this, V); - Value *IsNotInf = - Builder.CreateFCmpUNE(Abs, ConstantFP::getInfinity(V->getType()),"isinf"); - - V = Builder.CreateAnd(Eq, IsNotInf, "and"); - return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType()))); - } - case Builtin::BI__builtin_fpclassify: { Value *V = EmitScalarExpr(E->getArg(5)); llvm::Type *Ty = ConvertType(E->getArg(5)->getType()); @@ -1258,7 +1351,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::StoreInst *Store = Builder.CreateAlignedStore(llvm::Constant::getNullValue(ITy), Ptr, StoreSize); - Store->setAtomic(llvm::Release); + Store->setAtomic(llvm::AtomicOrdering::Release); return RValue::get(nullptr); } @@ -1270,7 +1363,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, // any way to safely use it... but in practice, it mostly works // to use it with non-atomic loads and stores to get acquire/release // semantics. - Builder.CreateFence(llvm::SequentiallyConsistent); + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); return RValue::get(nullptr); } @@ -1294,9 +1387,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)), getContext().VoidPtrTy); const CGFunctionInfo &FuncInfo = - CGM.getTypes().arrangeFreeFunctionCall(E->getType(), Args, - FunctionType::ExtInfo(), - RequiredArgs::All); + CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args); llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo); llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName); return EmitCall(FuncInfo, Func, ReturnValueSlot(), Args); @@ -1320,30 +1411,27 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, switch (ord) { case 0: // memory_order_relaxed default: // invalid order - Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, - Ptr, NewVal, - llvm::Monotonic); + Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal, + llvm::AtomicOrdering::Monotonic); break; - case 1: // memory_order_consume - case 2: // memory_order_acquire - Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, - Ptr, NewVal, - llvm::Acquire); + case 1: // memory_order_consume + case 2: // memory_order_acquire + Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal, + llvm::AtomicOrdering::Acquire); break; - case 3: // memory_order_release - Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, - Ptr, NewVal, - llvm::Release); + case 3: // memory_order_release + Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal, + llvm::AtomicOrdering::Release); break; - case 4: // memory_order_acq_rel - Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, - Ptr, NewVal, - llvm::AcquireRelease); + case 4: // memory_order_acq_rel + + Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal, + llvm::AtomicOrdering::AcquireRelease); break; - case 5: // 
memory_order_seq_cst - Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, - Ptr, NewVal, - llvm::SequentiallyConsistent); + case 5: // memory_order_seq_cst + Result = Builder.CreateAtomicRMW( + llvm::AtomicRMWInst::Xchg, Ptr, NewVal, + llvm::AtomicOrdering::SequentiallyConsistent); break; } Result->setVolatile(Volatile); @@ -1360,9 +1448,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, createBasicBlock("seqcst", CurFn) }; llvm::AtomicOrdering Orders[5] = { - llvm::Monotonic, llvm::Acquire, llvm::Release, - llvm::AcquireRelease, llvm::SequentiallyConsistent - }; + llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire, + llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease, + llvm::AtomicOrdering::SequentiallyConsistent}; Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false); llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]); @@ -1406,13 +1494,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, switch (ord) { case 0: // memory_order_relaxed default: // invalid order - Store->setOrdering(llvm::Monotonic); + Store->setOrdering(llvm::AtomicOrdering::Monotonic); break; case 3: // memory_order_release - Store->setOrdering(llvm::Release); + Store->setOrdering(llvm::AtomicOrdering::Release); break; case 5: // memory_order_seq_cst - Store->setOrdering(llvm::SequentiallyConsistent); + Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent); break; } return RValue::get(nullptr); @@ -1426,8 +1514,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, createBasicBlock("seqcst", CurFn) }; llvm::AtomicOrdering Orders[3] = { - llvm::Monotonic, llvm::Release, llvm::SequentiallyConsistent - }; + llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release, + llvm::AtomicOrdering::SequentiallyConsistent}; Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false); llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]); @@ -1466,16 +1554,17 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, break; case 1: // memory_order_consume case 2: // memory_order_acquire - Builder.CreateFence(llvm::Acquire, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope); break; case 3: // memory_order_release - Builder.CreateFence(llvm::Release, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Release, Scope); break; case 4: // memory_order_acq_rel - Builder.CreateFence(llvm::AcquireRelease, Scope); + Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope); break; case 5: // memory_order_seq_cst - Builder.CreateFence(llvm::SequentiallyConsistent, Scope); + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, + Scope); break; } return RValue::get(nullptr); @@ -1492,23 +1581,23 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB); Builder.SetInsertPoint(AcquireBB); - Builder.CreateFence(llvm::Acquire, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(1), AcquireBB); SI->addCase(Builder.getInt32(2), AcquireBB); Builder.SetInsertPoint(ReleaseBB); - Builder.CreateFence(llvm::Release, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Release, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(3), ReleaseBB); Builder.SetInsertPoint(AcqRelBB); - Builder.CreateFence(llvm::AcquireRelease, Scope); + Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope); Builder.CreateBr(ContBB); 
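// (Plain-C++ sketch of the memory_order to AtomicOrdering mapping that the
// switches above implement; as in the emitted code, consume is strengthened
// to acquire and an invalid order falls back to relaxed/monotonic.)
static const char *orderingForMemoryOrder(int ord) {
  switch (ord) {
  case 1: // memory_order_consume (treated as acquire)
  case 2: // memory_order_acquire
    return "Acquire";
  case 3: // memory_order_release
    return "Release";
  case 4: // memory_order_acq_rel
    return "AcquireRelease";
  case 5: // memory_order_seq_cst
    return "SequentiallyConsistent";
  case 0: // memory_order_relaxed
  default: // invalid order
    return "Monotonic";
  }
}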
SI->addCase(Builder.getInt32(4), AcqRelBB); Builder.SetInsertPoint(SeqCstBB); - Builder.CreateFence(llvm::SequentiallyConsistent, Scope); + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(5), SeqCstBB); @@ -1794,7 +1883,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, break; } - + llvm::Value *Carry; llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry); Builder.CreateStore(Sum, SumOutPtr); @@ -1839,9 +1928,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::Value *Comparand = Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType); - auto Result = Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange, - SequentiallyConsistent, - SequentiallyConsistent); + auto Result = + Builder.CreateAtomicCmpXchg(Destination, Comparand, Exchange, + AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent); Result->setVolatile(true); return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result, @@ -1853,44 +1943,47 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(1)), - SequentiallyConsistent, - SequentiallyConsistent); + AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent); CXI->setVolatile(true); return RValue::get(Builder.CreateExtractValue(CXI, 0)); } case Builtin::BI_InterlockedIncrement: { + llvm::Type *IntTy = ConvertType(E->getType()); AtomicRMWInst *RMWI = Builder.CreateAtomicRMW( AtomicRMWInst::Add, EmitScalarExpr(E->getArg(0)), - ConstantInt::get(Int32Ty, 1), - llvm::SequentiallyConsistent); + ConstantInt::get(IntTy, 1), + llvm::AtomicOrdering::SequentiallyConsistent); RMWI->setVolatile(true); - return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(Int32Ty, 1))); + return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(IntTy, 1))); } case Builtin::BI_InterlockedDecrement: { + llvm::Type *IntTy = ConvertType(E->getType()); AtomicRMWInst *RMWI = Builder.CreateAtomicRMW( AtomicRMWInst::Sub, EmitScalarExpr(E->getArg(0)), - ConstantInt::get(Int32Ty, 1), - llvm::SequentiallyConsistent); + ConstantInt::get(IntTy, 1), + llvm::AtomicOrdering::SequentiallyConsistent); RMWI->setVolatile(true); - return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(Int32Ty, 1))); + return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(IntTy, 1))); } case Builtin::BI_InterlockedExchangeAdd: { AtomicRMWInst *RMWI = Builder.CreateAtomicRMW( AtomicRMWInst::Add, EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), - llvm::SequentiallyConsistent); + llvm::AtomicOrdering::SequentiallyConsistent); RMWI->setVolatile(true); return RValue::get(RMWI); } case Builtin::BI__readfsdword: { + llvm::Type *IntTy = ConvertType(E->getType()); Value *IntToPtr = Builder.CreateIntToPtr(EmitScalarExpr(E->getArg(0)), - llvm::PointerType::get(CGM.Int32Ty, 257)); + llvm::PointerType::get(IntTy, 257)); LoadInst *Load = - Builder.CreateAlignedLoad(IntToPtr, /*Align=*/4, /*isVolatile=*/true); + Builder.CreateDefaultAlignedLoad(IntToPtr, /*isVolatile=*/true); return RValue::get(Load); } @@ -1963,6 +2056,323 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy)); break; } + + // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions + case Builtin::BIread_pipe: + case Builtin::BIwrite_pipe: { + Value *Arg0 = 
EmitScalarExpr(E->getArg(0)), + *Arg1 = EmitScalarExpr(E->getArg(1)); + + // Type of the generic packet parameter. + unsigned GenericAS = + getContext().getTargetAddressSpace(LangAS::opencl_generic); + llvm::Type *I8PTy = llvm::PointerType::get( + llvm::Type::getInt8Ty(getLLVMContext()), GenericAS); + + // Testing which overloaded version we should generate the call for. + if (2U == E->getNumArgs()) { + const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2" + : "__write_pipe_2"; + // Creating a generic function type to be able to call with any builtin or + // user defined type. + llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy}; + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false); + Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy); + return RValue::get(Builder.CreateCall( + CGM.CreateRuntimeFunction(FTy, Name), {Arg0, BCast})); + } else { + assert(4 == E->getNumArgs() && + "Illegal number of parameters to pipe function"); + const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4" + : "__write_pipe_4"; + + llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy}; + Value *Arg2 = EmitScalarExpr(E->getArg(2)), + *Arg3 = EmitScalarExpr(E->getArg(3)); + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false); + Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy); + // We know the third argument is an integer type, but we may need to cast + // it to i32. + if (Arg2->getType() != Int32Ty) + Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty); + return RValue::get(Builder.CreateCall( + CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1, Arg2, BCast})); + } + } + // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write + // functions + case Builtin::BIreserve_read_pipe: + case Builtin::BIreserve_write_pipe: + case Builtin::BIwork_group_reserve_read_pipe: + case Builtin::BIwork_group_reserve_write_pipe: + case Builtin::BIsub_group_reserve_read_pipe: + case Builtin::BIsub_group_reserve_write_pipe: { + // Composing the mangled name for the function. + const char *Name; + if (BuiltinID == Builtin::BIreserve_read_pipe) + Name = "__reserve_read_pipe"; + else if (BuiltinID == Builtin::BIreserve_write_pipe) + Name = "__reserve_write_pipe"; + else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe) + Name = "__work_group_reserve_read_pipe"; + else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe) + Name = "__work_group_reserve_write_pipe"; + else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe) + Name = "__sub_group_reserve_read_pipe"; + else + Name = "__sub_group_reserve_write_pipe"; + + Value *Arg0 = EmitScalarExpr(E->getArg(0)), + *Arg1 = EmitScalarExpr(E->getArg(1)); + llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy); + + // Building the generic function prototype. + llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty}; + llvm::FunctionType *FTy = llvm::FunctionType::get( + ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false); + // We know the second argument is an integer type, but we may need to cast + // it to i32. 
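// (CreateZExtOrTrunc, used just below, normalizes an integer of any width to
// i32: narrower values are zero-extended, wider ones truncated. A plain-C++
// model of that behaviour, assuming a source width of at most 64 bits:)
#include <cstdint>
static uint32_t zextOrTruncTo32(uint64_t V, unsigned SrcBits) {
  if (SrcBits < 64)
    V &= (1ULL << SrcBits) - 1; // keep only the source's bits (zero-extend)
  return static_cast<uint32_t>(V); // sources wider than 32 bits get truncated
}
// (The cast itself:)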
+ if (Arg1->getType() != Int32Ty) + Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty); + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1})); + } + // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write + // functions + case Builtin::BIcommit_read_pipe: + case Builtin::BIcommit_write_pipe: + case Builtin::BIwork_group_commit_read_pipe: + case Builtin::BIwork_group_commit_write_pipe: + case Builtin::BIsub_group_commit_read_pipe: + case Builtin::BIsub_group_commit_write_pipe: { + const char *Name; + if (BuiltinID == Builtin::BIcommit_read_pipe) + Name = "__commit_read_pipe"; + else if (BuiltinID == Builtin::BIcommit_write_pipe) + Name = "__commit_write_pipe"; + else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe) + Name = "__work_group_commit_read_pipe"; + else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe) + Name = "__work_group_commit_write_pipe"; + else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe) + Name = "__sub_group_commit_read_pipe"; + else + Name = "__sub_group_commit_write_pipe"; + + Value *Arg0 = EmitScalarExpr(E->getArg(0)), + *Arg1 = EmitScalarExpr(E->getArg(1)); + + // Building the generic function prototype. + llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType()}; + llvm::FunctionType *FTy = + llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()), + llvm::ArrayRef<llvm::Type *>(ArgTys), false); + + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0, Arg1})); + } + // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions + case Builtin::BIget_pipe_num_packets: + case Builtin::BIget_pipe_max_packets: { + const char *Name; + if (BuiltinID == Builtin::BIget_pipe_num_packets) + Name = "__get_pipe_num_packets"; + else + Name = "__get_pipe_max_packets"; + + // Building the generic function prototype. + Value *Arg0 = EmitScalarExpr(E->getArg(0)); + llvm::Type *ArgTys[] = {Arg0->getType()}; + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false); + + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), {Arg0})); + } + + // OpenCL v2.0 s6.13.9 - Address space qualifier functions. + case Builtin::BIto_global: + case Builtin::BIto_local: + case Builtin::BIto_private: { + auto Arg0 = EmitScalarExpr(E->getArg(0)); + auto NewArgT = llvm::PointerType::get(Int8Ty, + CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic)); + auto NewRetT = llvm::PointerType::get(Int8Ty, + CGM.getContext().getTargetAddressSpace( + E->getType()->getPointeeType().getAddressSpace())); + auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false); + llvm::Value *NewArg; + if (Arg0->getType()->getPointerAddressSpace() != + NewArgT->getPointerAddressSpace()) + NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT); + else + NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT); + auto NewName = std::string("__") + E->getDirectCallee()->getName().str(); + auto NewCall = + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg}); + return RValue::get(Builder.CreateBitOrPointerCast(NewCall, + ConvertType(E->getType()))); + } + + // OpenCL v2.0, s6.13.17 - Enqueue kernel function. + // It contains four different overload formats specified in Table 6.13.17.1. 
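// (Summary of how those four overloads map onto the runtime entry points
// emitted below; this selector is a plain-C++ sketch over the same conditions
// the case tests, namely the argument count and whether argument 3 is the
// block.)
static const char *enqueueKernelEntryPoint(unsigned NumArgs, bool BlockIsArg3) {
  if (NumArgs == 4) return "__enqueue_kernel_basic";        // no events/varargs
  if (BlockIsArg3)  return "__enqueue_kernel_vaargs";       // varargs, no events
  if (NumArgs == 7) return "__enqueue_kernel_basic_events"; // events, no varargs
  return "__enqueue_kernel_events_vaargs";                  // events and varargs
}
// (The case itself:)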
+ case Builtin::BIenqueue_kernel: { + StringRef Name; // Generated function call name + unsigned NumArgs = E->getNumArgs(); + + llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy); + llvm::Type *RangeTy = ConvertType(getContext().OCLNDRangeTy); + + llvm::Value *Queue = EmitScalarExpr(E->getArg(0)); + llvm::Value *Flags = EmitScalarExpr(E->getArg(1)); + llvm::Value *Range = EmitScalarExpr(E->getArg(2)); + + if (NumArgs == 4) { + // The most basic form of the call with parameters: + // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void) + Name = "__enqueue_kernel_basic"; + llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, Int8PtrTy}; + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys, 4), false); + + llvm::Value *Block = + Builder.CreateBitCast(EmitScalarExpr(E->getArg(3)), Int8PtrTy); + + return RValue::get(Builder.CreateCall( + CGM.CreateRuntimeFunction(FTy, Name), {Queue, Flags, Range, Block})); + } + assert(NumArgs >= 5 && "Invalid enqueue_kernel signature"); + + // Could have events and/or vaargs. + if (E->getArg(3)->getType()->isBlockPointerType()) { + // No events passed, but has variadic arguments. + Name = "__enqueue_kernel_vaargs"; + llvm::Value *Block = + Builder.CreateBitCast(EmitScalarExpr(E->getArg(3)), Int8PtrTy); + // Create a vector of the arguments, as well as a constant value to + // express to the runtime the number of variadic arguments. + std::vector<llvm::Value *> Args = {Queue, Flags, Range, Block, + ConstantInt::get(IntTy, NumArgs - 4)}; + std::vector<llvm::Type *> ArgTys = {QueueTy, IntTy, RangeTy, Int8PtrTy, + IntTy}; + + // Add the variadics. + for (unsigned I = 4; I < NumArgs; ++I) { + llvm::Value *ArgSize = EmitScalarExpr(E->getArg(I)); + unsigned TypeSizeInBytes = + getContext() + .getTypeSizeInChars(E->getArg(I)->getType()) + .getQuantity(); + Args.push_back(TypeSizeInBytes < 4 + ? Builder.CreateZExt(ArgSize, Int32Ty) + : ArgSize); + } + + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), true); + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), + llvm::ArrayRef<llvm::Value *>(Args))); + } + // Any calls now have event arguments passed. + if (NumArgs >= 7) { + llvm::Type *EventTy = ConvertType(getContext().OCLClkEventTy); + unsigned AS4 = + E->getArg(4)->getType()->isArrayType() + ? E->getArg(4)->getType().getAddressSpace() + : E->getArg(4)->getType()->getPointeeType().getAddressSpace(); + llvm::Type *EventPtrAS4Ty = + EventTy->getPointerTo(CGM.getContext().getTargetAddressSpace(AS4)); + unsigned AS5 = + E->getArg(5)->getType()->getPointeeType().getAddressSpace(); + llvm::Type *EventPtrAS5Ty = + EventTy->getPointerTo(CGM.getContext().getTargetAddressSpace(AS5)); + + llvm::Value *NumEvents = EmitScalarExpr(E->getArg(3)); + llvm::Value *EventList = + E->getArg(4)->getType()->isArrayType() + ? EmitArrayToPointerDecay(E->getArg(4)).getPointer() + : EmitScalarExpr(E->getArg(4)); + llvm::Value *ClkEvent = EmitScalarExpr(E->getArg(5)); + llvm::Value *Block = + Builder.CreateBitCast(EmitScalarExpr(E->getArg(6)), Int8PtrTy); + + std::vector<llvm::Type *> ArgTys = { + QueueTy, Int32Ty, RangeTy, Int32Ty, + EventPtrAS4Ty, EventPtrAS5Ty, Int8PtrTy}; + std::vector<llvm::Value *> Args = {Queue, Flags, Range, NumEvents, + EventList, ClkEvent, Block}; + + if (NumArgs == 7) { + // Has events but no variadics. 
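// (Both variadic paths forward each trailing argument as a byte count,
// zero-extending anything narrower than i32, and also tell the runtime how
// many such sizes follow. Worked example: enqueue_kernel(q, flags, range,
// block, s0, s1) has NumArgs == 6, so the emitted call is
// __enqueue_kernel_vaargs(q, flags, range, block, 2, s0, s1) with the count
// computed as below.)
static unsigned numVariadicSizes(unsigned NumArgs, bool HasEvents) {
  // Four fixed parameters without events, seven with them.
  return NumArgs - (HasEvents ? 7 : 4);
}
// (First the events form without varargs:)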
+ Name = "__enqueue_kernel_basic_events"; + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false); + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), + llvm::ArrayRef<llvm::Value *>(Args))); + } + // Has event info and variadics + // Pass the number of variadics to the runtime function too. + Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7)); + ArgTys.push_back(Int32Ty); + Name = "__enqueue_kernel_events_vaargs"; + + // Add the variadics. + for (unsigned I = 7; I < NumArgs; ++I) { + llvm::Value *ArgSize = EmitScalarExpr(E->getArg(I)); + unsigned TypeSizeInBytes = + getContext() + .getTypeSizeInChars(E->getArg(I)->getType()) + .getQuantity(); + Args.push_back(TypeSizeInBytes < 4 + ? Builder.CreateZExt(ArgSize, Int32Ty) + : ArgSize); + } + llvm::FunctionType *FTy = llvm::FunctionType::get( + Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), true); + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), + llvm::ArrayRef<llvm::Value *>(Args))); + } + } + // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block + // parameter. + case Builtin::BIget_kernel_work_group_size: { + Value *Arg = EmitScalarExpr(E->getArg(0)); + Arg = Builder.CreateBitCast(Arg, Int8PtrTy); + return RValue::get( + Builder.CreateCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, Int8PtrTy, false), + "__get_kernel_work_group_size_impl"), + Arg)); + } + case Builtin::BIget_kernel_preferred_work_group_size_multiple: { + Value *Arg = EmitScalarExpr(E->getArg(0)); + Arg = Builder.CreateBitCast(Arg, Int8PtrTy); + return RValue::get(Builder.CreateCall( + CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, Int8PtrTy, false), + "__get_kernel_preferred_work_group_multiple_impl"), + Arg)); + } + case Builtin::BIprintf: + if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) + return EmitCUDADevicePrintfCallExpr(E, ReturnValue); + break; + case Builtin::BI__builtin_canonicalize: + case Builtin::BI__builtin_canonicalizef: + case Builtin::BI__builtin_canonicalizel: + return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize)); + + case Builtin::BI__builtin_thread_pointer: { + if (!getContext().getTargetInfo().isTLSSupported()) + CGM.ErrorUnsupported(E, "__builtin_thread_pointer"); + // Fall through - it's already mapped to the intrinsic by GCCBuiltin. + break; + } } // If this is an alias for a lib function (e.g. 
__builtin_sin), emit @@ -2155,7 +2565,7 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF, } Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) { - unsigned nElts = cast<llvm::VectorType>(V->getType())->getNumElements(); + unsigned nElts = V->getType()->getVectorNumElements(); Value* SV = llvm::ConstantVector::getSplat(nElts, C); return Builder.CreateShuffleVector(V, V, SV, "lane"); } @@ -3073,14 +3483,13 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vext_v: case NEON::BI__builtin_neon_vextq_v: { int CV = cast<ConstantInt>(Ops[2])->getSExtValue(); - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) - Indices.push_back(ConstantInt::get(Int32Ty, i+CV)); + Indices.push_back(i+CV); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ops[1] = Builder.CreateBitCast(Ops[1], Ty); - Value *SV = llvm::ConstantVector::get(Indices); - return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext"); + return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext"); } case NEON::BI__builtin_neon_vfma_v: case NEON::BI__builtin_neon_vfmaq_v: { @@ -3278,14 +3687,13 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { - Indices.push_back(Builder.getInt32(i+vi)); - Indices.push_back(Builder.getInt32(i+e+vi)); + Indices.push_back(i+vi); + Indices.push_back(i+e+vi); } Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -3307,13 +3715,12 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) - Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi)); + Indices.push_back(2*i+vi); Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -3326,14 +3733,13 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { - Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1)); - Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e)); + Indices.push_back((i + vi*e) >> 1); + Indices.push_back(((i + vi*e) >> 1)+e); } Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -3381,19 +3787,19 @@ static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops, if (ExtOp) TblOps.push_back(ExtOp); - // Build a vector containing sequential 
number like (0, 1, 2, ..., 15) - SmallVector<Constant*, 16> Indices; + // Build a vector containing sequential number like (0, 1, 2, ..., 15) + SmallVector<uint32_t, 16> Indices; llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType()); for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { - Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i)); - Indices.push_back(ConstantInt::get(CGF.Int32Ty, 2*i+1)); + Indices.push_back(2*i); + Indices.push_back(2*i+1); } - Value *SV = llvm::ConstantVector::get(Indices); int PairPos = 0, End = Ops.size() - 1; while (PairPos < End) { TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], - Ops[PairPos+1], SV, Name)); + Ops[PairPos+1], Indices, + Name)); PairPos += 2; } @@ -3402,13 +3808,13 @@ static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops, if (PairPos == End) { Value *ZeroTbl = ConstantAggregateZero::get(TblTy); TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos], - ZeroTbl, SV, Name)); + ZeroTbl, Indices, Name)); } Function *TblF; TblOps.push_back(IndexOp); TblF = CGF.CGM.getIntrinsic(IntID, ResTy); - + return CGF.EmitNeonCall(TblF, TblOps, Name); } @@ -3452,7 +3858,9 @@ Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) { static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF, const CallExpr *E, llvm::Type *RegisterType, - llvm::Type *ValueType, bool IsRead) { + llvm::Type *ValueType, + bool IsRead, + StringRef SysReg = "") { // write and register intrinsics only support 32 and 64 bit operations. assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64)) && "Unsupported size for register."); @@ -3461,8 +3869,10 @@ static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF, CodeGen::CodeGenModule &CGM = CGF.CGM; LLVMContext &Context = CGM.getLLVMContext(); - const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts(); - StringRef SysReg = cast<StringLiteral>(SysRegStrExpr)->getString(); + if (SysReg.empty()) { + const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts(); + SysReg = cast<StringLiteral>(SysRegStrExpr)->getString(); + } llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) }; llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops); @@ -3602,6 +4012,74 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops); } + if (BuiltinID == ARM::BI__builtin_arm_mcrr || + BuiltinID == ARM::BI__builtin_arm_mcrr2) { + Function *F; + + switch (BuiltinID) { + default: llvm_unreachable("unexpected builtin"); + case ARM::BI__builtin_arm_mcrr: + F = CGM.getIntrinsic(Intrinsic::arm_mcrr); + break; + case ARM::BI__builtin_arm_mcrr2: + F = CGM.getIntrinsic(Intrinsic::arm_mcrr2); + break; + } + + // MCRR{2} instruction has 5 operands but + // the intrinsic has 4 because Rt and Rt2 + // are represented as a single unsigned 64 + // bit integer in the intrinsic definition + // but internally it's represented as 2 32 + // bit integers. 
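// (Plain-C++ model of the split performed just below: Rt is the low half of
// the 64-bit value, obtained by truncation, and Rt2 the high half, obtained
// by a logical shift right by 32 followed by truncation.)
#include <cstdint>
static void splitRtPair(uint64_t RtAndRt2, uint32_t &Rt, uint32_t &Rt2) {
  Rt  = static_cast<uint32_t>(RtAndRt2);       // CreateTruncOrBitCast to i32
  Rt2 = static_cast<uint32_t>(RtAndRt2 >> 32); // CreateLShr 32, then trunc
}
// (The split as emitted:)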
+ + Value *Coproc = EmitScalarExpr(E->getArg(0)); + Value *Opc1 = EmitScalarExpr(E->getArg(1)); + Value *RtAndRt2 = EmitScalarExpr(E->getArg(2)); + Value *CRm = EmitScalarExpr(E->getArg(3)); + + Value *C1 = llvm::ConstantInt::get(Int64Ty, 32); + Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty); + Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1); + Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty); + + return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm}); + } + + if (BuiltinID == ARM::BI__builtin_arm_mrrc || + BuiltinID == ARM::BI__builtin_arm_mrrc2) { + Function *F; + + switch (BuiltinID) { + default: llvm_unreachable("unexpected builtin"); + case ARM::BI__builtin_arm_mrrc: + F = CGM.getIntrinsic(Intrinsic::arm_mrrc); + break; + case ARM::BI__builtin_arm_mrrc2: + F = CGM.getIntrinsic(Intrinsic::arm_mrrc2); + break; + } + + Value *Coproc = EmitScalarExpr(E->getArg(0)); + Value *Opc1 = EmitScalarExpr(E->getArg(1)); + Value *CRm = EmitScalarExpr(E->getArg(2)); + Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm}); + + // Returns an unsigned 64 bit integer, represented + // as two 32 bit integers. + + Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1); + Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0); + Rt = Builder.CreateZExt(Rt, Int64Ty); + Rt1 = Builder.CreateZExt(Rt1, Int64Ty); + + Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32); + RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true); + RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1); + + return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType())); + } + if (BuiltinID == ARM::BI__builtin_arm_ldrexd || ((BuiltinID == ARM::BI__builtin_arm_ldrex || BuiltinID == ARM::BI__builtin_arm_ldaex) && @@ -3914,7 +4392,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, // the first argument, but the LLVM intrinsic expects it as the third one. case ARM::BI_MoveToCoprocessor: case ARM::BI_MoveToCoprocessor2: { - Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ? + Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ? Intrinsic::arm_mcr : Intrinsic::arm_mcr2); return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0], Ops[3], Ops[4], Ops[5]}); @@ -4478,11 +4956,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F); } - if (BuiltinID == AArch64::BI__builtin_thread_pointer) { - Function *F = CGM.getIntrinsic(Intrinsic::aarch64_thread_pointer); - return Builder.CreateCall(F); - } - // CRC32 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic; switch (BuiltinID) { @@ -5150,22 +5623,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0]}); } - case NEON::BI__builtin_neon_vfms_v: - case NEON::BI__builtin_neon_vfmsq_v: { // Only used for FP types - // FIXME: probably remove when we no longer support aarch64_simd.h - // (arm_neon.h delegates to vfma). - - // The ARM builtins (and instructions) have the addend as the first - // operand, but the 'fma' intrinsics have it last. Swap it around here. - Value *Subtrahend = Ops[0]; - Value *Multiplicand = Ops[2]; - Ops[0] = Multiplicand; - Ops[2] = Subtrahend; - Ops[1] = Builder.CreateBitCast(Ops[1], VTy); - Ops[1] = Builder.CreateFNeg(Ops[1]); - Int = Intrinsic::fma; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls"); - } case NEON::BI__builtin_neon_vmull_v: // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. 
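// (And the inverse recombination used by the MRRC/MRRC2 lowering above: the
// intrinsic returns two i32 values, extracted as Rt (high half, index 1) and
// Rt1 (low half, index 0), which are zero-extended and merged. Plain C++:)
#include <cstdint>
static uint64_t joinRtPair(uint32_t Rt1 /*low*/, uint32_t Rt /*high*/) {
  return (static_cast<uint64_t>(Rt) << 32) | Rt1; // shl 32, then or
}
// (Back to the NEON vmull lowering:)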
Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull; @@ -5988,14 +6445,13 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { - Indices.push_back(ConstantInt::get(Int32Ty, i+vi)); - Indices.push_back(ConstantInt::get(Int32Ty, i+e+vi)); + Indices.push_back(i+vi); + Indices.push_back(i+e+vi); } Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -6008,13 +6464,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) - Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi)); + Indices.push_back(2*i+vi); Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -6027,14 +6482,13 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *SV = nullptr; for (unsigned vi = 0; vi != 2; ++vi) { - SmallVector<Constant*, 16> Indices; + SmallVector<uint32_t, 16> Indices; for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) { - Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1)); - Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e)); + Indices.push_back((i + vi*e) >> 1); + Indices.push_back(((i + vi*e) >> 1)+e); } Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi); - SV = llvm::ConstantVector::get(Indices); - SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip"); + SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip"); SV = Builder.CreateDefaultAlignedStore(SV, Addr); } return SV; @@ -6110,6 +6564,118 @@ BuildVector(ArrayRef<llvm::Value*> Ops) { return Result; } +// Convert the mask from an integer type to a vector of i1. +static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, + unsigned NumElts) { + + llvm::VectorType *MaskTy = llvm::VectorType::get(CGF.Builder.getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy); + + // If we have less than 8 elements, then the starting mask was an i8 and + // we need to extract down to the right number of elements. + if (NumElts < 8) { + uint32_t Indices[4]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + MaskVec = CGF.Builder.CreateShuffleVector(MaskVec, MaskVec, + makeArrayRef(Indices, NumElts), + "extract"); + } + return MaskVec; +} + +static Value *EmitX86MaskedStore(CodeGenFunction &CGF, + SmallVectorImpl<Value *> &Ops, + unsigned Align) { + // Cast the pointer to right type. + Ops[0] = CGF.Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + + // If the mask is all ones just emit a regular store. 
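// (Plain-C++ model of getMaskVecValue above: bit i of the integer mask
// becomes element i of an <N x i1> vector; with fewer than 8 elements the
// extracting shuffle keeps only the low NumElts bits of the i8 mask.)
#include <cstdint>
#include <vector>
static std::vector<bool> maskBitsToVector(uint8_t Mask, unsigned NumElts) {
  std::vector<bool> Vec(NumElts);
  for (unsigned i = 0; i != NumElts; ++i)
    Vec[i] = (Mask >> i) & 1; // bitcast iN to <N x i1>, keep low elements
  return Vec;
}
// (The all-ones fast path announced above:)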
+ if (const auto *C = dyn_cast<Constant>(Ops[2])) + if (C->isAllOnesValue()) + return CGF.Builder.CreateAlignedStore(Ops[1], Ops[0], Align); + + Value *MaskVec = getMaskVecValue(CGF, Ops[2], + Ops[1]->getType()->getVectorNumElements()); + + return CGF.Builder.CreateMaskedStore(Ops[1], Ops[0], Align, MaskVec); +} + +static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, + SmallVectorImpl<Value *> &Ops, unsigned Align) { + // Cast the pointer to right type. + Ops[0] = CGF.Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Ops[1]->getType())); + + // If the mask is all ones just emit a regular store. + if (const auto *C = dyn_cast<Constant>(Ops[2])) + if (C->isAllOnesValue()) + return CGF.Builder.CreateAlignedLoad(Ops[0], Align); + + Value *MaskVec = getMaskVecValue(CGF, Ops[2], + Ops[1]->getType()->getVectorNumElements()); + + return CGF.Builder.CreateMaskedLoad(Ops[0], Align, MaskVec, Ops[1]); +} + +static Value *EmitX86Select(CodeGenFunction &CGF, + Value *Mask, Value *Op0, Value *Op1) { + + // If the mask is all ones just return first argument. + if (const auto *C = dyn_cast<Constant>(Mask)) + if (C->isAllOnesValue()) + return Op0; + + Mask = getMaskVecValue(CGF, Mask, Op0->getType()->getVectorNumElements()); + + return CGF.Builder.CreateSelect(Mask, Op0, Op1); +} + +static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, + bool Signed, SmallVectorImpl<Value *> &Ops) { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + Value *Cmp; + + if (CC == 3) { + Cmp = Constant::getNullValue( + llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts)); + } else if (CC == 7) { + Cmp = Constant::getAllOnesValue( + llvm::VectorType::get(CGF.Builder.getInt1Ty(), NumElts)); + } else { + ICmpInst::Predicate Pred; + switch (CC) { + default: llvm_unreachable("Unknown condition code"); + case 0: Pred = ICmpInst::ICMP_EQ; break; + case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; + case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; + case 4: Pred = ICmpInst::ICMP_NE; break; + case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; + case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; + } + Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]); + } + + const auto *C = dyn_cast<Constant>(Ops.back()); + if (!C || !C->isAllOnesValue()) + Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops.back(), NumElts)); + + if (NumElts < 8) { + uint32_t Indices[8]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + for (unsigned i = NumElts; i != 8; ++i) + Indices[i] = i % NumElts + NumElts; + Cmp = CGF.Builder.CreateShuffleVector( + Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices); + } + return CGF.Builder.CreateBitCast(Cmp, + IntegerType::get(CGF.getLLVMContext(), + std::max(NumElts, 8U))); +} + Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E) { if (BuiltinID == X86::BI__builtin_ms_va_start || @@ -6160,6 +6726,31 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result)); } + // These exist so that the builtin that takes an immediate can be bounds + // checked by clang to avoid passing bad immediates to the backend. Since + // AVX has a larger immediate than SSE we would need separate builtins to + // do the different bounds checking. Rather than create a clang specific + // SSE only builtin, this implements eight separate builtins to match gcc + // implementation. 
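// (Plain-C++ model of the tail of EmitX86MaskedCompare above: when NumElts is
// below 8, the i1 compare results are concatenated with elements taken from a
// null vector, via indices i % NumElts + NumElts into the second shuffle
// operand, so the final bitcast always yields at least an i8 mask.)
#include <cstdint>
static uint8_t padCompareMask(const bool Cmp[], unsigned NumElts) {
  uint8_t Result = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Cmp[i])
      Result |= static_cast<uint8_t>(1u << i); // bits NumElts..7 remain zero
  return Result;
}
// (The bounds-checked-immediate helpers described above:)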
+ auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) { + Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm)); + llvm::Function *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Ops); + }; + + // For the vector forms of FP comparisons, translate the builtins directly to + // IR. + // TODO: The builtins could be removed if the SSE header files used vector + // extension comparisons directly (vector ordered/unordered may need + // additional support via __builtin_isnan()). + auto getVectorFCmpIR = [this, &Ops](CmpInst::Predicate Pred) { + Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); + llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType()); + llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy); + Value *Sext = Builder.CreateSExt(Cmp, IntVecTy); + return Builder.CreateBitCast(Sext, FPVecTy); + }; + switch (BuiltinID) { default: return nullptr; case X86::BI__builtin_cpu_supports: { @@ -6188,6 +6779,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, AVX512F, BMI, BMI2, + AES, + PCLMUL, + AVX512VL, + AVX512BW, + AVX512DQ, + AVX512CD, + AVX512ER, + AVX512PF, + AVX512VBMI, + AVX512IFMA, MAX }; @@ -6198,6 +6799,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, .Case("sse", X86Features::SSE) .Case("sse2", X86Features::SSE2) .Case("sse3", X86Features::SSE3) + .Case("ssse3", X86Features::SSSE3) .Case("sse4.1", X86Features::SSE4_1) .Case("sse4.2", X86Features::SSE4_2) .Case("avx", X86Features::AVX) @@ -6209,6 +6811,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, .Case("avx512f", X86Features::AVX512F) .Case("bmi", X86Features::BMI) .Case("bmi2", X86Features::BMI2) + .Case("aes", X86Features::AES) + .Case("pclmul", X86Features::PCLMUL) + .Case("avx512vl", X86Features::AVX512VL) + .Case("avx512bw", X86Features::AVX512BW) + .Case("avx512dq", X86Features::AVX512DQ) + .Case("avx512cd", X86Features::AVX512CD) + .Case("avx512er", X86Features::AVX512ER) + .Case("avx512pf", X86Features::AVX512PF) + .Case("avx512vbmi", X86Features::AVX512VBMI) + .Case("avx512ifma", X86Features::AVX512IFMA) .Default(X86Features::MAX); assert(Feature != X86Features::MAX && "Invalid feature!"); @@ -6237,7 +6849,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // Check the value of the bit corresponding to the feature requested. 
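// (Plain-C++ model of that bit test; Feature is the enumerator's position in
// the __cpu_model features word. The switch to 1ULL below presumably guards
// the host-side shift against signed overflow once feature indices approach
// bit 31; the emitted IR constant is still an i32.)
#include <cstdint>
static bool cpuSupportsFeature(uint32_t FeaturesWord, unsigned Feature) {
  return (FeaturesWord & (1u << Feature)) != 0; // assumes Feature < 32
}
// (The emitted test:)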
Value *Bitset = Builder.CreateAnd( - Features, llvm::ConstantInt::get(Int32Ty, 1 << Feature)); + Features, llvm::ConstantInt::get(Int32Ty, 1ULL << Feature)); return Builder.CreateICmpNE(Bitset, llvm::ConstantInt::get(Int32Ty, 0)); } case X86::BI_mm_prefetch: { @@ -6312,6 +6924,78 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops.push_back(Mlo); return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); } + case X86::BI__builtin_ia32_storedqudi128_mask: + case X86::BI__builtin_ia32_storedqusi128_mask: + case X86::BI__builtin_ia32_storedquhi128_mask: + case X86::BI__builtin_ia32_storedquqi128_mask: + case X86::BI__builtin_ia32_storeupd128_mask: + case X86::BI__builtin_ia32_storeups128_mask: + case X86::BI__builtin_ia32_storedqudi256_mask: + case X86::BI__builtin_ia32_storedqusi256_mask: + case X86::BI__builtin_ia32_storedquhi256_mask: + case X86::BI__builtin_ia32_storedquqi256_mask: + case X86::BI__builtin_ia32_storeupd256_mask: + case X86::BI__builtin_ia32_storeups256_mask: + case X86::BI__builtin_ia32_storedqudi512_mask: + case X86::BI__builtin_ia32_storedqusi512_mask: + case X86::BI__builtin_ia32_storedquhi512_mask: + case X86::BI__builtin_ia32_storedquqi512_mask: + case X86::BI__builtin_ia32_storeupd512_mask: + case X86::BI__builtin_ia32_storeups512_mask: + return EmitX86MaskedStore(*this, Ops, 1); + + case X86::BI__builtin_ia32_movdqa32store128_mask: + case X86::BI__builtin_ia32_movdqa64store128_mask: + case X86::BI__builtin_ia32_storeaps128_mask: + case X86::BI__builtin_ia32_storeapd128_mask: + case X86::BI__builtin_ia32_movdqa32store256_mask: + case X86::BI__builtin_ia32_movdqa64store256_mask: + case X86::BI__builtin_ia32_storeaps256_mask: + case X86::BI__builtin_ia32_storeapd256_mask: + case X86::BI__builtin_ia32_movdqa32store512_mask: + case X86::BI__builtin_ia32_movdqa64store512_mask: + case X86::BI__builtin_ia32_storeaps512_mask: + case X86::BI__builtin_ia32_storeapd512_mask: { + unsigned Align = + getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity(); + return EmitX86MaskedStore(*this, Ops, Align); + } + case X86::BI__builtin_ia32_loadups128_mask: + case X86::BI__builtin_ia32_loadups256_mask: + case X86::BI__builtin_ia32_loadups512_mask: + case X86::BI__builtin_ia32_loadupd128_mask: + case X86::BI__builtin_ia32_loadupd256_mask: + case X86::BI__builtin_ia32_loadupd512_mask: + case X86::BI__builtin_ia32_loaddquqi128_mask: + case X86::BI__builtin_ia32_loaddquqi256_mask: + case X86::BI__builtin_ia32_loaddquqi512_mask: + case X86::BI__builtin_ia32_loaddquhi128_mask: + case X86::BI__builtin_ia32_loaddquhi256_mask: + case X86::BI__builtin_ia32_loaddquhi512_mask: + case X86::BI__builtin_ia32_loaddqusi128_mask: + case X86::BI__builtin_ia32_loaddqusi256_mask: + case X86::BI__builtin_ia32_loaddqusi512_mask: + case X86::BI__builtin_ia32_loaddqudi128_mask: + case X86::BI__builtin_ia32_loaddqudi256_mask: + case X86::BI__builtin_ia32_loaddqudi512_mask: + return EmitX86MaskedLoad(*this, Ops, 1); + + case X86::BI__builtin_ia32_loadaps128_mask: + case X86::BI__builtin_ia32_loadaps256_mask: + case X86::BI__builtin_ia32_loadaps512_mask: + case X86::BI__builtin_ia32_loadapd128_mask: + case X86::BI__builtin_ia32_loadapd256_mask: + case X86::BI__builtin_ia32_loadapd512_mask: + case X86::BI__builtin_ia32_movdqa32load128_mask: + case X86::BI__builtin_ia32_movdqa32load256_mask: + case X86::BI__builtin_ia32_movdqa32load512_mask: + case X86::BI__builtin_ia32_movdqa64load128_mask: + case X86::BI__builtin_ia32_movdqa64load256_mask: + case 
X86::BI__builtin_ia32_movdqa64load512_mask: { + unsigned Align = + getContext().getTypeAlignInChars(E->getArg(1)->getType()).getQuantity(); + return EmitX86MaskedLoad(*this, Ops, Align); + } case X86::BI__builtin_ia32_storehps: case X86::BI__builtin_ia32_storelps: { llvm::Type *PtrTy = llvm::PointerType::getUnqual(Int64Ty); @@ -6330,103 +7014,50 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } case X86::BI__builtin_ia32_palignr128: - case X86::BI__builtin_ia32_palignr256: { + case X86::BI__builtin_ia32_palignr256: + case X86::BI__builtin_ia32_palignr128_mask: + case X86::BI__builtin_ia32_palignr256_mask: + case X86::BI__builtin_ia32_palignr512_mask: { unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); - unsigned NumElts = - cast<llvm::VectorType>(Ops[0]->getType())->getNumElements(); + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); assert(NumElts % 16 == 0); - unsigned NumLanes = NumElts / 16; - unsigned NumLaneElts = NumElts / NumLanes; // If palignr is shifting the pair of vectors more than the size of two // lanes, emit zero. - if (ShiftVal >= (2 * NumLaneElts)) + if (ShiftVal >= 32) return llvm::Constant::getNullValue(ConvertType(E->getType())); // If palignr is shifting the pair of input vectors more than one lane, // but less than two lanes, convert to shifting in zeroes. - if (ShiftVal > NumLaneElts) { - ShiftVal -= NumLaneElts; + if (ShiftVal > 16) { + ShiftVal -= 16; Ops[1] = Ops[0]; Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType()); } - uint32_t Indices[32]; + uint32_t Indices[64]; // 256-bit palignr operates on 128-bit lanes so we need to handle that - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0; i != NumLaneElts; ++i) { + for (unsigned l = 0; l != NumElts; l += 16) { + for (unsigned i = 0; i != 16; ++i) { unsigned Idx = ShiftVal + i; - if (Idx >= NumLaneElts) - Idx += NumElts - NumLaneElts; // End of lane, switch operand. + if (Idx >= 16) + Idx += NumElts - 16; // End of lane, switch operand. Indices[l + i] = Idx + l; } } - Value *SV = llvm::ConstantDataVector::get(getLLVMContext(), - makeArrayRef(Indices, NumElts)); - return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr"); - } - case X86::BI__builtin_ia32_pslldqi256: { - // Shift value is in bits so divide by 8. - unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> 3; + Value *Align = Builder.CreateShuffleVector(Ops[1], Ops[0], + makeArrayRef(Indices, NumElts), + "palignr"); - // If pslldq is shifting the vector more than 15 bytes, emit zero. - if (shiftVal >= 16) - return llvm::Constant::getNullValue(ConvertType(E->getType())); - - uint32_t Indices[32]; - // 256-bit pslldq operates on 128-bit lanes so we need to handle that - for (unsigned l = 0; l != 32; l += 16) { - for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = 32 + i - shiftVal; - if (Idx < 32) Idx -= 16; // end of lane, switch operand. - Indices[l + i] = Idx + l; - } - } - - llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); - Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); + // If this isn't a masked builtin, just return the align operation. 
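// (Worked example for the palignr indices above: a 128-bit palignr with
// ShiftVal == 4 and NumElts == 16 produces indices 4..19, that is, bytes
// 4..15 of Ops[1] followed by bytes 0..3 of Ops[0], matching PALIGNR's
// semantics of concatenating the operands and shifting right by imm8 bytes.)
#include <cstdint>
static void palignr4Indices(uint32_t Indices[16]) {
  for (unsigned i = 0; i != 16; ++i)
    Indices[i] = 4 + i; // indices 0..15 pick from Ops[1], 16..31 from Ops[0]
}
// (The masked-variant select announced above:)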
+ if (Ops.size() == 3) + return Align; - Value *SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices); - SV = Builder.CreateShuffleVector(Zero, Ops[0], SV, "pslldq"); - llvm::Type *ResultType = ConvertType(E->getType()); - return Builder.CreateBitCast(SV, ResultType, "cast"); + return EmitX86Select(*this, Ops[4], Align, Ops[3]); } - case X86::BI__builtin_ia32_psrldqi256: { - // Shift value is in bits so divide by 8. - unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> 3; - - // If psrldq is shifting the vector more than 15 bytes, emit zero. - if (shiftVal >= 16) - return llvm::Constant::getNullValue(ConvertType(E->getType())); - uint32_t Indices[32]; - // 256-bit psrldq operates on 128-bit lanes so we need to handle that - for (unsigned l = 0; l != 32; l += 16) { - for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = i + shiftVal; - if (Idx >= 16) Idx += 16; // end of lane, switch operand. - Indices[l + i] = Idx + l; - } - } - - llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); - Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - - Value *SV = llvm::ConstantDataVector::get(getLLVMContext(), Indices); - SV = Builder.CreateShuffleVector(Ops[0], Zero, SV, "psrldq"); - llvm::Type *ResultType = ConvertType(E->getType()); - return Builder.CreateBitCast(SV, ResultType, "cast"); - } - case X86::BI__builtin_ia32_movntps: - case X86::BI__builtin_ia32_movntps256: - case X86::BI__builtin_ia32_movntpd: - case X86::BI__builtin_ia32_movntpd256: - case X86::BI__builtin_ia32_movntdq: - case X86::BI__builtin_ia32_movntdq256: case X86::BI__builtin_ia32_movnti: case X86::BI__builtin_ia32_movnti64: { llvm::MDNode *Node = llvm::MDNode::get( @@ -6439,17 +7070,156 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC); SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node); - // If the operand is an integer, we can't assume alignment. Otherwise, - // assume natural alignment. - QualType ArgTy = E->getArg(1)->getType(); - unsigned Align; - if (ArgTy->isIntegerType()) - Align = 1; - else - Align = getContext().getTypeSizeInChars(ArgTy).getQuantity(); - SI->setAlignment(Align); + // No alignment for scalar intrinsic store. + SI->setAlignment(1); + return SI; + } + case X86::BI__builtin_ia32_movntsd: + case X86::BI__builtin_ia32_movntss: { + llvm::MDNode *Node = llvm::MDNode::get( + getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1))); + + // Extract the 0'th element of the source vector. + Value *Scl = Builder.CreateExtractElement(Ops[1], (uint64_t)0, "extract"); + + // Convert the type of the pointer to a pointer to the stored type. + Value *BC = Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(Scl->getType()), + "cast"); + + // Unaligned nontemporal store of the scalar value. 
+ StoreInst *SI = Builder.CreateDefaultAlignedStore(Scl, BC); + SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node); + SI->setAlignment(1); return SI; } + + case X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case X86::BI__builtin_ia32_selectq_128: + case X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: + return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); + case X86::BI__builtin_ia32_pcmpeqb128_mask: + case X86::BI__builtin_ia32_pcmpeqb256_mask: + case X86::BI__builtin_ia32_pcmpeqb512_mask: + case X86::BI__builtin_ia32_pcmpeqw128_mask: + case X86::BI__builtin_ia32_pcmpeqw256_mask: + case X86::BI__builtin_ia32_pcmpeqw512_mask: + case X86::BI__builtin_ia32_pcmpeqd128_mask: + case X86::BI__builtin_ia32_pcmpeqd256_mask: + case X86::BI__builtin_ia32_pcmpeqd512_mask: + case X86::BI__builtin_ia32_pcmpeqq128_mask: + case X86::BI__builtin_ia32_pcmpeqq256_mask: + case X86::BI__builtin_ia32_pcmpeqq512_mask: + return EmitX86MaskedCompare(*this, 0, false, Ops); + case X86::BI__builtin_ia32_pcmpgtb128_mask: + case X86::BI__builtin_ia32_pcmpgtb256_mask: + case X86::BI__builtin_ia32_pcmpgtb512_mask: + case X86::BI__builtin_ia32_pcmpgtw128_mask: + case X86::BI__builtin_ia32_pcmpgtw256_mask: + case X86::BI__builtin_ia32_pcmpgtw512_mask: + case X86::BI__builtin_ia32_pcmpgtd128_mask: + case X86::BI__builtin_ia32_pcmpgtd256_mask: + case X86::BI__builtin_ia32_pcmpgtd512_mask: + case X86::BI__builtin_ia32_pcmpgtq128_mask: + case X86::BI__builtin_ia32_pcmpgtq256_mask: + case X86::BI__builtin_ia32_pcmpgtq512_mask: + return EmitX86MaskedCompare(*this, 6, true, Ops); + case X86::BI__builtin_ia32_cmpb128_mask: + case X86::BI__builtin_ia32_cmpb256_mask: + case X86::BI__builtin_ia32_cmpb512_mask: + case X86::BI__builtin_ia32_cmpw128_mask: + case X86::BI__builtin_ia32_cmpw256_mask: + case X86::BI__builtin_ia32_cmpw512_mask: + case X86::BI__builtin_ia32_cmpd128_mask: + case X86::BI__builtin_ia32_cmpd256_mask: + case X86::BI__builtin_ia32_cmpd512_mask: + case X86::BI__builtin_ia32_cmpq128_mask: + case X86::BI__builtin_ia32_cmpq256_mask: + case X86::BI__builtin_ia32_cmpq512_mask: { + unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; + return EmitX86MaskedCompare(*this, CC, true, Ops); + } + case X86::BI__builtin_ia32_ucmpb128_mask: + case X86::BI__builtin_ia32_ucmpb256_mask: + case X86::BI__builtin_ia32_ucmpb512_mask: + case X86::BI__builtin_ia32_ucmpw128_mask: + case X86::BI__builtin_ia32_ucmpw256_mask: + case X86::BI__builtin_ia32_ucmpw512_mask: + case X86::BI__builtin_ia32_ucmpd128_mask: + case X86::BI__builtin_ia32_ucmpd256_mask: + case X86::BI__builtin_ia32_ucmpd512_mask: + case X86::BI__builtin_ia32_ucmpq128_mask: + case X86::BI__builtin_ia32_ucmpq256_mask: + case X86::BI__builtin_ia32_ucmpq512_mask: { + unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7; + return EmitX86MaskedCompare(*this, CC, false, Ops); + } + + case X86::BI__builtin_ia32_vplzcntd_128_mask: + 
case X86::BI__builtin_ia32_vplzcntd_256_mask: + case X86::BI__builtin_ia32_vplzcntd_512_mask: + case X86::BI__builtin_ia32_vplzcntq_128_mask: + case X86::BI__builtin_ia32_vplzcntq_256_mask: + case X86::BI__builtin_ia32_vplzcntq_512_mask: { + Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); + return EmitX86Select(*this, Ops[2], + Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}), + Ops[1]); + } + + // TODO: Handle 64/512-bit vector widths of min/max. + case X86::BI__builtin_ia32_pmaxsb128: + case X86::BI__builtin_ia32_pmaxsw128: + case X86::BI__builtin_ia32_pmaxsd128: + case X86::BI__builtin_ia32_pmaxsb256: + case X86::BI__builtin_ia32_pmaxsw256: + case X86::BI__builtin_ia32_pmaxsd256: { + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Ops[1]); + return Builder.CreateSelect(Cmp, Ops[0], Ops[1]); + } + case X86::BI__builtin_ia32_pmaxub128: + case X86::BI__builtin_ia32_pmaxuw128: + case X86::BI__builtin_ia32_pmaxud128: + case X86::BI__builtin_ia32_pmaxub256: + case X86::BI__builtin_ia32_pmaxuw256: + case X86::BI__builtin_ia32_pmaxud256: { + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]); + return Builder.CreateSelect(Cmp, Ops[0], Ops[1]); + } + case X86::BI__builtin_ia32_pminsb128: + case X86::BI__builtin_ia32_pminsw128: + case X86::BI__builtin_ia32_pminsd128: + case X86::BI__builtin_ia32_pminsb256: + case X86::BI__builtin_ia32_pminsw256: + case X86::BI__builtin_ia32_pminsd256: { + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_SLT, Ops[0], Ops[1]); + return Builder.CreateSelect(Cmp, Ops[0], Ops[1]); + } + case X86::BI__builtin_ia32_pminub128: + case X86::BI__builtin_ia32_pminuw128: + case X86::BI__builtin_ia32_pminud128: + case X86::BI__builtin_ia32_pminub256: + case X86::BI__builtin_ia32_pminuw256: + case X86::BI__builtin_ia32_pminud256: { + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, Ops[0], Ops[1]); + return Builder.CreateSelect(Cmp, Ops[0], Ops[1]); + } + // 3DNow! case X86::BI__builtin_ia32_pswapdsf: case X86::BI__builtin_ia32_pswapdsi: { @@ -6492,154 +7262,107 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops[0]); return Builder.CreateExtractValue(Call, 1); } - // SSE comparison intrisics + + // SSE packed comparison intrinsics case X86::BI__builtin_ia32_cmpeqps: + case X86::BI__builtin_ia32_cmpeqpd: + return getVectorFCmpIR(CmpInst::FCMP_OEQ); case X86::BI__builtin_ia32_cmpltps: + case X86::BI__builtin_ia32_cmpltpd: + return getVectorFCmpIR(CmpInst::FCMP_OLT); case X86::BI__builtin_ia32_cmpleps: + case X86::BI__builtin_ia32_cmplepd: + return getVectorFCmpIR(CmpInst::FCMP_OLE); case X86::BI__builtin_ia32_cmpunordps: + case X86::BI__builtin_ia32_cmpunordpd: + return getVectorFCmpIR(CmpInst::FCMP_UNO); case X86::BI__builtin_ia32_cmpneqps: + case X86::BI__builtin_ia32_cmpneqpd: + return getVectorFCmpIR(CmpInst::FCMP_UNE); case X86::BI__builtin_ia32_cmpnltps: + case X86::BI__builtin_ia32_cmpnltpd: + return getVectorFCmpIR(CmpInst::FCMP_UGE); case X86::BI__builtin_ia32_cmpnleps: + case X86::BI__builtin_ia32_cmpnlepd: + return getVectorFCmpIR(CmpInst::FCMP_UGT); case X86::BI__builtin_ia32_cmpordps: + case X86::BI__builtin_ia32_cmpordpd: + return getVectorFCmpIR(CmpInst::FCMP_ORD); + case X86::BI__builtin_ia32_cmpps: + case X86::BI__builtin_ia32_cmpps256: + case X86::BI__builtin_ia32_cmppd: + case X86::BI__builtin_ia32_cmppd256: { + unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + // If this is one of the SSE immediates, we can use native IR.
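// (Illustrative: _mm_cmp_ps(a, b, _CMP_LT_OS) arrives here with CC == 1
// and becomes "fcmp olt <4 x float>"; the AVX-only predicates 8..31 have
// no direct IR equivalent and fall through to the target intrinsic below.)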
+ if (CC < 8) { + FCmpInst::Predicate Pred; + switch (CC) { + case 0: Pred = FCmpInst::FCMP_OEQ; break; + case 1: Pred = FCmpInst::FCMP_OLT; break; + case 2: Pred = FCmpInst::FCMP_OLE; break; + case 3: Pred = FCmpInst::FCMP_UNO; break; + case 4: Pred = FCmpInst::FCMP_UNE; break; + case 5: Pred = FCmpInst::FCMP_UGE; break; + case 6: Pred = FCmpInst::FCMP_UGT; break; + case 7: Pred = FCmpInst::FCMP_ORD; break; + } + return getVectorFCmpIR(Pred); + } + + // We can't handle 8-31 immediates with native IR, use the intrinsic. + Intrinsic::ID ID; + switch (BuiltinID) { + default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_cmpps: + ID = Intrinsic::x86_sse_cmp_ps; + break; + case X86::BI__builtin_ia32_cmpps256: + ID = Intrinsic::x86_avx_cmp_ps_256; + break; + case X86::BI__builtin_ia32_cmppd: + ID = Intrinsic::x86_sse2_cmp_pd; + break; + case X86::BI__builtin_ia32_cmppd256: + ID = Intrinsic::x86_avx_cmp_pd_256; + break; + } + + return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); + } + + // SSE scalar comparison intrinsics case X86::BI__builtin_ia32_cmpeqss: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0); case X86::BI__builtin_ia32_cmpltss: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1); case X86::BI__builtin_ia32_cmpless: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2); case X86::BI__builtin_ia32_cmpunordss: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3); case X86::BI__builtin_ia32_cmpneqss: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4); case X86::BI__builtin_ia32_cmpnltss: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5); case X86::BI__builtin_ia32_cmpnless: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6); case X86::BI__builtin_ia32_cmpordss: - case X86::BI__builtin_ia32_cmpeqpd: - case X86::BI__builtin_ia32_cmpltpd: - case X86::BI__builtin_ia32_cmplepd: - case X86::BI__builtin_ia32_cmpunordpd: - case X86::BI__builtin_ia32_cmpneqpd: - case X86::BI__builtin_ia32_cmpnltpd: - case X86::BI__builtin_ia32_cmpnlepd: - case X86::BI__builtin_ia32_cmpordpd: + return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7); case X86::BI__builtin_ia32_cmpeqsd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0); case X86::BI__builtin_ia32_cmpltsd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1); case X86::BI__builtin_ia32_cmplesd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2); case X86::BI__builtin_ia32_cmpunordsd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3); case X86::BI__builtin_ia32_cmpneqsd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4); case X86::BI__builtin_ia32_cmpnltsd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5); case X86::BI__builtin_ia32_cmpnlesd: + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6); case X86::BI__builtin_ia32_cmpordsd: - // These exist so that the builtin that takes an immediate can be bounds - // checked by clang to avoid passing bad immediates to the backend. Since - // AVX has a larger immediate than SSE we would need separate builtins to - // do the different bounds checking. Rather than create a clang specific - // SSE only builtin, this implements eight separate builtins to match gcc - // implementation. - - // Choose the immediate. 
- unsigned Imm; - switch (BuiltinID) { - default: llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_cmpeqps: - case X86::BI__builtin_ia32_cmpeqss: - case X86::BI__builtin_ia32_cmpeqpd: - case X86::BI__builtin_ia32_cmpeqsd: - Imm = 0; - break; - case X86::BI__builtin_ia32_cmpltps: - case X86::BI__builtin_ia32_cmpltss: - case X86::BI__builtin_ia32_cmpltpd: - case X86::BI__builtin_ia32_cmpltsd: - Imm = 1; - break; - case X86::BI__builtin_ia32_cmpleps: - case X86::BI__builtin_ia32_cmpless: - case X86::BI__builtin_ia32_cmplepd: - case X86::BI__builtin_ia32_cmplesd: - Imm = 2; - break; - case X86::BI__builtin_ia32_cmpunordps: - case X86::BI__builtin_ia32_cmpunordss: - case X86::BI__builtin_ia32_cmpunordpd: - case X86::BI__builtin_ia32_cmpunordsd: - Imm = 3; - break; - case X86::BI__builtin_ia32_cmpneqps: - case X86::BI__builtin_ia32_cmpneqss: - case X86::BI__builtin_ia32_cmpneqpd: - case X86::BI__builtin_ia32_cmpneqsd: - Imm = 4; - break; - case X86::BI__builtin_ia32_cmpnltps: - case X86::BI__builtin_ia32_cmpnltss: - case X86::BI__builtin_ia32_cmpnltpd: - case X86::BI__builtin_ia32_cmpnltsd: - Imm = 5; - break; - case X86::BI__builtin_ia32_cmpnleps: - case X86::BI__builtin_ia32_cmpnless: - case X86::BI__builtin_ia32_cmpnlepd: - case X86::BI__builtin_ia32_cmpnlesd: - Imm = 6; - break; - case X86::BI__builtin_ia32_cmpordps: - case X86::BI__builtin_ia32_cmpordss: - case X86::BI__builtin_ia32_cmpordpd: - case X86::BI__builtin_ia32_cmpordsd: - Imm = 7; - break; - } - - // Choose the intrinsic ID. - const char *name; - Intrinsic::ID ID; - switch (BuiltinID) { - default: llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_cmpeqps: - case X86::BI__builtin_ia32_cmpltps: - case X86::BI__builtin_ia32_cmpleps: - case X86::BI__builtin_ia32_cmpunordps: - case X86::BI__builtin_ia32_cmpneqps: - case X86::BI__builtin_ia32_cmpnltps: - case X86::BI__builtin_ia32_cmpnleps: - case X86::BI__builtin_ia32_cmpordps: - name = "cmpps"; - ID = Intrinsic::x86_sse_cmp_ps; - break; - case X86::BI__builtin_ia32_cmpeqss: - case X86::BI__builtin_ia32_cmpltss: - case X86::BI__builtin_ia32_cmpless: - case X86::BI__builtin_ia32_cmpunordss: - case X86::BI__builtin_ia32_cmpneqss: - case X86::BI__builtin_ia32_cmpnltss: - case X86::BI__builtin_ia32_cmpnless: - case X86::BI__builtin_ia32_cmpordss: - name = "cmpss"; - ID = Intrinsic::x86_sse_cmp_ss; - break; - case X86::BI__builtin_ia32_cmpeqpd: - case X86::BI__builtin_ia32_cmpltpd: - case X86::BI__builtin_ia32_cmplepd: - case X86::BI__builtin_ia32_cmpunordpd: - case X86::BI__builtin_ia32_cmpneqpd: - case X86::BI__builtin_ia32_cmpnltpd: - case X86::BI__builtin_ia32_cmpnlepd: - case X86::BI__builtin_ia32_cmpordpd: - name = "cmppd"; - ID = Intrinsic::x86_sse2_cmp_pd; - break; - case X86::BI__builtin_ia32_cmpeqsd: - case X86::BI__builtin_ia32_cmpltsd: - case X86::BI__builtin_ia32_cmplesd: - case X86::BI__builtin_ia32_cmpunordsd: - case X86::BI__builtin_ia32_cmpneqsd: - case X86::BI__builtin_ia32_cmpnltsd: - case X86::BI__builtin_ia32_cmpnlesd: - case X86::BI__builtin_ia32_cmpordsd: - name = "cmpsd"; - ID = Intrinsic::x86_sse2_cmp_sd; - break; - } - - Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm)); - llvm::Function *F = CGM.getIntrinsic(ID); - return Builder.CreateCall(F, Ops, name); + return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7); } } @@ -6812,6 +7535,16 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(ID, ResultType); return Builder.CreateCall(F, X); } + + // Absolute value + 
case PPC::BI__builtin_vsx_xvabsdp: + case PPC::BI__builtin_vsx_xvabssp: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *X = EmitScalarExpr(E->getArg(0)); + llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType); + return Builder.CreateCall(F, X); + } + + // FMA variations case PPC::BI__builtin_vsx_xvmaddadp: case PPC::BI__builtin_vsx_xvmaddasp: @@ -6851,44 +7584,11 @@ } } -// Emit an intrinsic that has 1 float or double. -static Value *emitUnaryFPBuiltin(CodeGenFunction &CGF, - const CallExpr *E, - unsigned IntrinsicID) { - llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); - - Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); - return CGF.Builder.CreateCall(F, Src0); -} - -// Emit an intrinsic that has 3 float or double operands. -static Value *emitTernaryFPBuiltin(CodeGenFunction &CGF, - const CallExpr *E, - unsigned IntrinsicID) { - llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); - llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); - llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2)); - - Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); - return CGF.Builder.CreateCall(F, {Src0, Src1, Src2}); -} - -// Emit an intrinsic that has 1 float or double operand, and 1 integer. -static Value *emitFPIntBuiltin(CodeGenFunction &CGF, - const CallExpr *E, - unsigned IntrinsicID) { - llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); - llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); - - Value *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); - return CGF.Builder.CreateCall(F, {Src0, Src1}); -} - Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgpu_div_scale: - case AMDGPU::BI__builtin_amdgpu_div_scalef: { + case AMDGPU::BI__builtin_amdgcn_div_scale: + case AMDGPU::BI__builtin_amdgcn_div_scalef: { // Translate from the intrinsic's struct return to the builtin's out // argument.
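(A source-level sketch of the out-argument translation performed here; the
variable names are illustrative, and the signature is the one clang declares
for __builtin_amdgcn_div_scalef:)

    bool flag;
    float scaled = __builtin_amdgcn_div_scalef(num, den,
                                               /*scale the numerator*/ true,
                                               &flag);
    // CodeGen calls llvm.amdgcn.div.scale, which returns a { value, i1 }
    // pair; the value is returned and the widened flag is stored to &flag.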
@@ -6898,7 +7598,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Y = EmitScalarExpr(E->getArg(1)); llvm::Value *Z = EmitScalarExpr(E->getArg(2)); - llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::AMDGPU_div_scale, + llvm::Value *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale, X->getType()); llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z}); @@ -6913,40 +7613,85 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Builder.CreateStore(FlagExt, FlagOutPtr); return Result; } - case AMDGPU::BI__builtin_amdgpu_div_fmas: - case AMDGPU::BI__builtin_amdgpu_div_fmasf: { + case AMDGPU::BI__builtin_amdgcn_div_fmas: + case AMDGPU::BI__builtin_amdgcn_div_fmasf: { llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); - llvm::Value *F = CGM.getIntrinsic(Intrinsic::AMDGPU_div_fmas, + llvm::Value *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas, Src0->getType()); llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3); return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool}); } - case AMDGPU::BI__builtin_amdgpu_div_fixup: - case AMDGPU::BI__builtin_amdgpu_div_fixupf: - return emitTernaryFPBuiltin(*this, E, Intrinsic::AMDGPU_div_fixup); - case AMDGPU::BI__builtin_amdgpu_trig_preop: - case AMDGPU::BI__builtin_amdgpu_trig_preopf: - return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_trig_preop); - case AMDGPU::BI__builtin_amdgpu_rcp: - case AMDGPU::BI__builtin_amdgpu_rcpf: - return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rcp); - case AMDGPU::BI__builtin_amdgpu_rsq: - case AMDGPU::BI__builtin_amdgpu_rsqf: - return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq); - case AMDGPU::BI__builtin_amdgpu_rsq_clamped: - case AMDGPU::BI__builtin_amdgpu_rsq_clampedf: - return emitUnaryFPBuiltin(*this, E, Intrinsic::AMDGPU_rsq_clamped); - case AMDGPU::BI__builtin_amdgpu_ldexp: - case AMDGPU::BI__builtin_amdgpu_ldexpf: - return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_ldexp); - case AMDGPU::BI__builtin_amdgpu_class: - case AMDGPU::BI__builtin_amdgpu_classf: - return emitFPIntBuiltin(*this, E, Intrinsic::AMDGPU_class); - default: + case AMDGPU::BI__builtin_amdgcn_div_fixup: + case AMDGPU::BI__builtin_amdgcn_div_fixupf: + return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup); + case AMDGPU::BI__builtin_amdgcn_trig_preop: + case AMDGPU::BI__builtin_amdgcn_trig_preopf: + return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop); + case AMDGPU::BI__builtin_amdgcn_rcp: + case AMDGPU::BI__builtin_amdgcn_rcpf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp); + case AMDGPU::BI__builtin_amdgcn_rsq: + case AMDGPU::BI__builtin_amdgcn_rsqf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq); + case AMDGPU::BI__builtin_amdgcn_rsq_clamp: + case AMDGPU::BI__builtin_amdgcn_rsq_clampf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp); + case AMDGPU::BI__builtin_amdgcn_sinf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin); + case AMDGPU::BI__builtin_amdgcn_cosf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos); + case AMDGPU::BI__builtin_amdgcn_log_clampf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp); + case AMDGPU::BI__builtin_amdgcn_ldexp: + case AMDGPU::BI__builtin_amdgcn_ldexpf: + return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp); + case AMDGPU::BI__builtin_amdgcn_frexp_mant: + case 
AMDGPU::BI__builtin_amdgcn_frexp_mantf: { + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant); + } + case AMDGPU::BI__builtin_amdgcn_frexp_exp: + case AMDGPU::BI__builtin_amdgcn_frexp_expf: { + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_exp); + } + case AMDGPU::BI__builtin_amdgcn_fract: + case AMDGPU::BI__builtin_amdgcn_fractf: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract); + case AMDGPU::BI__builtin_amdgcn_lerp: + return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp); + case AMDGPU::BI__builtin_amdgcn_class: + case AMDGPU::BI__builtin_amdgcn_classf: + return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class); + + case AMDGPU::BI__builtin_amdgcn_read_exec: { + CallInst *CI = cast<CallInst>( + EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, true, "exec")); + CI->setConvergent(); + return CI; + } + + // amdgcn workitem + case AMDGPU::BI__builtin_amdgcn_workitem_id_x: + return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024); + case AMDGPU::BI__builtin_amdgcn_workitem_id_y: + return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024); + case AMDGPU::BI__builtin_amdgcn_workitem_id_z: + return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024); + + // r600 intrinsics + case AMDGPU::BI__builtin_r600_recipsqrt_ieee: + case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: + return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee); + case AMDGPU::BI__builtin_r600_read_tidig_x: + return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024); + case AMDGPU::BI__builtin_r600_read_tidig_y: + return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024); + case AMDGPU::BI__builtin_r600_read_tidig_z: + return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024); + default: return nullptr; } } @@ -7196,6 +7941,17 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { + auto MakeLdg = [&](unsigned IntrinsicID) { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + AlignmentSource AlignSource; + clang::CharUnits Align = + getNaturalPointeeTypeAlignment(E->getArg(0)->getType(), &AlignSource); + return Builder.CreateCall( + CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(), + Ptr->getType()}), + {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())}); + }; + switch (BuiltinID) { case NVPTX::BI__nvvm_atom_add_gen_i: case NVPTX::BI__nvvm_atom_add_gen_l: @@ -7264,6 +8020,56 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(FnALAF32, {Ptr, Val}); } + case NVPTX::BI__nvvm_atom_inc_gen_ui: { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Value *Val = EmitScalarExpr(E->getArg(1)); + Value *FnALI32 = + CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType()); + return Builder.CreateCall(FnALI32, {Ptr, Val}); + } + + case NVPTX::BI__nvvm_atom_dec_gen_ui: { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Value *Val = EmitScalarExpr(E->getArg(1)); + Value *FnALD32 = + CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType()); + return Builder.CreateCall(FnALD32, {Ptr, Val}); + } + + case NVPTX::BI__nvvm_ldg_c: + case NVPTX::BI__nvvm_ldg_c2: + case NVPTX::BI__nvvm_ldg_c4: + case NVPTX::BI__nvvm_ldg_s: + case NVPTX::BI__nvvm_ldg_s2: + case NVPTX::BI__nvvm_ldg_s4: + case NVPTX::BI__nvvm_ldg_i: + case NVPTX::BI__nvvm_ldg_i2: + case NVPTX::BI__nvvm_ldg_i4: + case NVPTX::BI__nvvm_ldg_l: + case 
NVPTX::BI__nvvm_ldg_ll: + case NVPTX::BI__nvvm_ldg_ll2: + case NVPTX::BI__nvvm_ldg_uc: + case NVPTX::BI__nvvm_ldg_uc2: + case NVPTX::BI__nvvm_ldg_uc4: + case NVPTX::BI__nvvm_ldg_us: + case NVPTX::BI__nvvm_ldg_us2: + case NVPTX::BI__nvvm_ldg_us4: + case NVPTX::BI__nvvm_ldg_ui: + case NVPTX::BI__nvvm_ldg_ui2: + case NVPTX::BI__nvvm_ldg_ui4: + case NVPTX::BI__nvvm_ldg_ul: + case NVPTX::BI__nvvm_ldg_ull: + case NVPTX::BI__nvvm_ldg_ull2: + // PTX Interoperability section 2.2: "For a vector with an even number of + // elements, its alignment is set to number of elements times the alignment + // of its member: n*alignof(t)." + return MakeLdg(Intrinsic::nvvm_ldg_global_i); + case NVPTX::BI__nvvm_ldg_f: + case NVPTX::BI__nvvm_ldg_f2: + case NVPTX::BI__nvvm_ldg_f4: + case NVPTX::BI__nvvm_ldg_d: + case NVPTX::BI__nvvm_ldg_d2: + return MakeLdg(Intrinsic::nvvm_ldg_global_f); default: return nullptr; } @@ -7272,9 +8078,9 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_memory_size: { + case WebAssembly::BI__builtin_wasm_current_memory: { llvm::Type *ResultType = ConvertType(E->getType()); - Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType); + Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType); return Builder.CreateCall(Callee); } case WebAssembly::BI__builtin_wasm_grow_memory: {
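(Standalone sketch of the PTX interoperability alignment rule quoted in the
__nvvm_ldg handling above; the concrete element type is illustrative:)

    #include <cstdio>
    int main() {
      // "n * alignof(t)": a 4-element float vector is aligned to
      // 4 * alignof(float), i.e. 16 on typical targets.
      unsigned n = 4;
      unsigned align = n * static_cast<unsigned>(alignof(float));
      std::printf("vector alignment: %u\n", align);
      return 0;
    }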