diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers')
35 files changed, 5703 insertions, 1835 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/altivec.h b/contrib/llvm/tools/clang/lib/Headers/altivec.h index a01d9d8..90fd477 100644 --- a/contrib/llvm/tools/clang/lib/Headers/altivec.h +++ b/contrib/llvm/tools/clang/lib/Headers/altivec.h @@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, /* vec_ctf */ -static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) { - return __builtin_altivec_vcfsx(__a, __b); -} - -static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a, - int __b) { - return __builtin_altivec_vcfux((vector int)__a, __b); -} - #ifdef __VSX__ -static __inline__ vector double __ATTRS_o_ai -vec_ctf(vector unsigned long long __a, int __b) { - vector double __ret = __builtin_convertvector(__a, vector double); - __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52); - return __ret; -} - -static __inline__ vector double __ATTRS_o_ai -vec_ctf(vector signed long long __a, int __b) { - vector double __ret = __builtin_convertvector(__a, vector double); - __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52); - return __ret; -} +#define vec_ctf(__a, __b) \ + _Generic((__a), vector int \ + : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \ + vector unsigned int \ + : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)), \ + vector unsigned long long \ + : (__builtin_convertvector((vector unsigned long long)(__a), \ + vector double) * \ + (vector double)(vector unsigned long long)((0x3ffULL - (__b)) \ + << 52)), \ + vector signed long long \ + : (__builtin_convertvector((vector signed long long)(__a), \ + vector double) * \ + (vector double)(vector unsigned long long)((0x3ffULL - (__b)) \ + << 52))) +#else +#define vec_ctf(__a, __b) \ + _Generic((__a), vector int \ + : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \ + vector unsigned int \ + : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b))) #endif /* vec_vcfsx */ -static __inline__ vector float __attribute__((__always_inline__)) -vec_vcfsx(vector int __a, int __b) { - return __builtin_altivec_vcfsx(__a, __b); -} +#define vec_vcfux __builtin_altivec_vcfux /* vec_vcfux */ -static __inline__ vector float __attribute__((__always_inline__)) -vec_vcfux(vector unsigned int __a, int __b) { - return __builtin_altivec_vcfux((vector int)__a, __b); -} +#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b)) /* vec_cts */ -static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) { - return __builtin_altivec_vctsxs(__a, __b); -} - #ifdef __VSX__ -static __inline__ vector signed long long __ATTRS_o_ai -vec_cts(vector double __a, int __b) { - __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52); - return __builtin_convertvector(__a, vector signed long long); -} +#define vec_cts(__a, __b) \ + _Generic((__a), vector float \ + : __builtin_altivec_vctsxs((__a), (__b)), vector double \ + : __extension__({ \ + vector double __ret = \ + (__a) * \ + (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \ + << 52); \ + __builtin_convertvector(__ret, vector signed long long); \ + })) +#else +#define vec_cts __builtin_altivec_vctsxs #endif /* vec_vctsxs */ -static __inline__ vector int __attribute__((__always_inline__)) -vec_vctsxs(vector float __a, int __b) { - return __builtin_altivec_vctsxs(__a, __b); -} +#define vec_vctsxs __builtin_altivec_vctsxs /* vec_ctu */ -static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a, - int __b) { - return __builtin_altivec_vctuxs(__a, __b); -} - #ifdef __VSX__ -static __inline__ vector unsigned long long __ATTRS_o_ai -vec_ctu(vector double __a, int __b) { - __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52); - return __builtin_convertvector(__a, vector unsigned long long); -} +#define vec_ctu(__a, __b) \ + _Generic((__a), vector float \ + : __builtin_altivec_vctuxs((__a), (__b)), vector double \ + : __extension__({ \ + vector double __ret = \ + (__a) * \ + (vector double)(vector unsigned long long)((0x3ffULL + __b) \ + << 52); \ + __builtin_convertvector(__ret, vector unsigned long long); \ + })) +#else +#define vec_ctu __builtin_altivec_vctuxs #endif /* vec_vctuxs */ -static __inline__ vector unsigned int __attribute__((__always_inline__)) -vec_vctuxs(vector float __a, int __b) { - return __builtin_altivec_vctuxs(__a, __b); -} +#define vec_vctuxs __builtin_altivec_vctuxs /* vec_signed */ @@ -8045,45 +8037,51 @@ static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a, /* vec_sl */ -static __inline__ vector signed char __ATTRS_o_ai -vec_sl(vector signed char __a, vector unsigned char __b) { - return __a << (vector signed char)__b; -} - +// vec_sl does modulo arithmetic on __b first, so __b is allowed to be more +// than the length of __a. static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b) { - return __a << __b; + return __a << (__b % + (vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__)); } -static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a, - vector unsigned short __b) { - return __a << (vector short)__b; +static __inline__ vector signed char __ATTRS_o_ai +vec_sl(vector signed char __a, vector unsigned char __b) { + return (vector signed char)vec_sl((vector unsigned char)__a, __b); } static __inline__ vector unsigned short __ATTRS_o_ai vec_sl(vector unsigned short __a, vector unsigned short __b) { - return __a << __b; + return __a << (__b % (vector unsigned short)(sizeof(unsigned short) * + __CHAR_BIT__)); } -static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a, - vector unsigned int __b) { - return __a << (vector int)__b; +static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a, + vector unsigned short __b) { + return (vector short)vec_sl((vector unsigned short)__a, __b); } static __inline__ vector unsigned int __ATTRS_o_ai vec_sl(vector unsigned int __a, vector unsigned int __b) { - return __a << __b; + return __a << (__b % + (vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__)); } -#ifdef __POWER8_VECTOR__ -static __inline__ vector signed long long __ATTRS_o_ai -vec_sl(vector signed long long __a, vector unsigned long long __b) { - return __a << (vector long long)__b; +static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a, + vector unsigned int __b) { + return (vector int)vec_sl((vector unsigned int)__a, __b); } +#ifdef __POWER8_VECTOR__ static __inline__ vector unsigned long long __ATTRS_o_ai vec_sl(vector unsigned long long __a, vector unsigned long long __b) { - return __a << __b; + return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) * + __CHAR_BIT__)); +} + +static __inline__ vector long long __ATTRS_o_ai +vec_sl(vector long long __a, vector unsigned long long __b) { + return (vector long long)vec_sl((vector unsigned long long)__a, __b); } #endif @@ -12150,6 +12148,11 @@ static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a, #endif +#ifdef __VSX__ +#define vec_xxpermdi __builtin_vsx_xxpermdi +#define vec_xxsldwi __builtin_vsx_xxsldwi +#endif + /* vec_xor */ #define __builtin_altivec_vxor vec_xor diff --git a/contrib/llvm/tools/clang/lib/Headers/arm_acle.h b/contrib/llvm/tools/clang/lib/Headers/arm_acle.h index 8423e62..ab25897 100644 --- a/contrib/llvm/tools/clang/lib/Headers/arm_acle.h +++ b/contrib/llvm/tools/clang/lib/Headers/arm_acle.h @@ -225,19 +225,49 @@ __rbitl(unsigned long __t) { } /* + * 9.3 16-bit multiplications + */ +#if __ARM_FEATURE_DSP +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulbb(int32_t __a, int32_t __b) { + return __builtin_arm_smulbb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulbt(int32_t __a, int32_t __b) { + return __builtin_arm_smulbt(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smultb(int32_t __a, int32_t __b) { + return __builtin_arm_smultb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smultt(int32_t __a, int32_t __b) { + return __builtin_arm_smultt(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulwb(int32_t __a, int32_t __b) { + return __builtin_arm_smulwb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulwt(int32_t __a, int32_t __b) { + return __builtin_arm_smulwt(__a, __b); +} +#endif + +/* * 9.4 Saturating intrinsics * * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag * intrinsics are implemented and the flag is enabled. */ /* 9.4.1 Width-specified saturation intrinsics */ -#if __ARM_32BIT_STATE +#if __ARM_FEATURE_SAT #define __ssat(x, y) __builtin_arm_ssat(x, y) #define __usat(x, y) __builtin_arm_usat(x, y) #endif /* 9.4.2 Saturating addition and subtraction intrinsics */ -#if __ARM_32BIT_STATE +#if __ARM_FEATURE_DSP static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __qadd(int32_t __t, int32_t __v) { return __builtin_arm_qadd(__t, __v); @@ -254,6 +284,290 @@ __qdbl(int32_t __t) { } #endif +/* 9.4.3 Accumultating multiplications */ +#if __ARM_FEATURE_DSP +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlabb(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlabb(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlabt(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlabt(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlatb(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlatb(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlatt(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlatt(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlawb(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlawb(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlawt(int32_t __a, int32_t __b, int32_t __c) { + return __builtin_arm_smlawt(__a, __b, __c); +} +#endif + + +/* 9.5.4 Parallel 16-bit saturation */ +#if __ARM_FEATURE_SIMD32 +#define __ssat16(x, y) __builtin_arm_ssat16(x, y) +#define __usat16(x, y) __builtin_arm_usat16(x, y) +#endif + +/* 9.5.5 Packing and unpacking */ +#if __ARM_FEATURE_SIMD32 +typedef int32_t int8x4_t; +typedef int32_t int16x2_t; +typedef uint32_t uint8x4_t; +typedef uint32_t uint16x2_t; + +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__sxtab16(int16x2_t __a, int8x4_t __b) { + return __builtin_arm_sxtab16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__sxtb16(int8x4_t __a) { + return __builtin_arm_sxtb16(__a); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__uxtab16(int16x2_t __a, int8x4_t __b) { + return __builtin_arm_uxtab16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__uxtb16(int8x4_t __a) { + return __builtin_arm_uxtb16(__a); +} +#endif + +/* 9.5.6 Parallel selection */ +#if __ARM_FEATURE_SIMD32 +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__sel(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_sel(__a, __b); +} +#endif + +/* 9.5.7 Parallel 8-bit addition and subtraction */ +#if __ARM_FEATURE_SIMD32 +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__qadd8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_qadd8(__a, __b); +} +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__qsub8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_qsub8(__a, __b); +} +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__sadd8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_sadd8(__a, __b); +} +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__shadd8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_shadd8(__a, __b); +} +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__shsub8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_shsub8(__a, __b); +} +static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__)) +__ssub8(int8x4_t __a, int8x4_t __b) { + return __builtin_arm_ssub8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__uadd8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_uadd8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__uhadd8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_uhadd8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__uhsub8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_uhsub8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__uqadd8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_uqadd8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__uqsub8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_uqsub8(__a, __b); +} +static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__)) +__usub8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_usub8(__a, __b); +} +#endif + +/* 9.5.8 Sum of 8-bit absolute differences */ +#if __ARM_FEATURE_SIMD32 +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) +__usad8(uint8x4_t __a, uint8x4_t __b) { + return __builtin_arm_usad8(__a, __b); +} +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) +__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) { + return __builtin_arm_usada8(__a, __b, __c); +} +#endif + +/* 9.5.9 Parallel 16-bit addition and subtraction */ +#if __ARM_FEATURE_SIMD32 +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__qadd16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_qadd16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__qasx(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_qasx(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__qsax(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_qsax(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__qsub16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_qsub16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__sadd16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_sadd16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__sasx(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_sasx(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__shadd16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_shadd16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__shasx(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_shasx(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__shsax(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_shsax(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__shsub16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_shsub16(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__ssax(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_ssax(__a, __b); +} +static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__)) +__ssub16(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_ssub16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uadd16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uadd16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uasx(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uasx(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uhadd16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uhadd16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uhasx(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uhasx(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uhsax(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uhsax(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uhsub16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uhsub16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uqadd16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uqadd16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uqasx(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uqasx(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uqsax(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uqsax(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__uqsub16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_uqsub16(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__usax(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_usax(__a, __b); +} +static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__)) +__usub16(uint16x2_t __a, uint16x2_t __b) { + return __builtin_arm_usub16(__a, __b); +} +#endif + +/* 9.5.10 Parallel 16-bit multiplications */ +#if __ARM_FEATURE_SIMD32 +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) { + return __builtin_arm_smlad(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) { + return __builtin_arm_smladx(__a, __b, __c); +} +static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) +__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) { + return __builtin_arm_smlald(__a, __b, __c); +} +static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) +__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) { + return __builtin_arm_smlaldx(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) { + return __builtin_arm_smlsd(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) { + return __builtin_arm_smlsdx(__a, __b, __c); +} +static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) +__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) { + return __builtin_arm_smlsld(__a, __b, __c); +} +static __inline__ int64_t __attribute__((__always_inline__, __nodebug__)) +__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) { + return __builtin_arm_smlsldx(__a, __b, __c); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smuad(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_smuad(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smuadx(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_smuadx(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smusd(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_smusd(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__smusdx(int16x2_t __a, int16x2_t __b) { + return __builtin_arm_smusdx(__a, __b); +} +#endif + /* 9.7 CRC32 intrinsics */ #if __ARM_FEATURE_CRC32 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) diff --git a/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h b/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h index 13bcbef..576f761 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h @@ -832,7 +832,8 @@ _mm256_xor_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_stream_load_si256(__m256i const *__V) { - return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V); + typedef __v4di __v4di_aligned __attribute__((aligned(32))); + return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); } static __inline__ __m128 __DEFAULT_FN_ATTRS diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h index 629dc86..41958b7 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h @@ -504,115 +504,91 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_packs_epi32 (__m512i __A, __m512i __B) +_mm512_packs_epi32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) _mm512_setzero_hi(), - __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) __W, - __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_packs_epi16 (__m512i __A, __m512i __B) +_mm512_packs_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) _mm512_setzero_qi(), - __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_packus_epi32 (__m512i __A, __m512i __B) +_mm512_packus_epi32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) _mm512_setzero_hi(), - __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, - (__v16si) __B, - (__v32hi) __W, - __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_packus_epi16 (__m512i __A, __m512i __B) +_mm512_packus_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h index ae44b98..4fd1add 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h @@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_broadcast_f32x8 (__m256 __A) +_mm512_broadcast_f32x8(__m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - _mm512_undefined_ps(), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A) +_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf)__O, - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A) +_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf)_mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_broadcast_f64x2 (__m128d __A) +_mm512_broadcast_f64x2(__m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df)_mm512_undefined_pd(), - (__mmask8) -1); + return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df) - __O, __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)__O); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df)_mm512_setzero_ps (), - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i32x8 (__m256i __A) +_mm512_broadcast_i32x8(__m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A) +_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si)__O, - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A) +_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i64x2 (__m128i __A) +_mm512_broadcast_i64x2(__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di)_mm512_setzero_si512(), - (__mmask8) -1); + return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) +_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di) - __O, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di)_mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)_mm512_setzero_si512()); } #define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \ diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h index e6a7217..4ce6945 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h @@ -528,6 +528,116 @@ _mm512_mask2int(__mmask16 __a) return (int)__a; } +/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 384 bits are set +/// to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. +static __inline __m512d __DEFAULT_FN_ATTRS +_mm512_zextpd128_pd512(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); +} + +/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a +/// 256-bit floating-point vector of [4 x double]. The lower 256 bits +/// contain the value of the source vector. The upper 256 bits are set +/// to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits +/// contain the value of the parameter. The upper 256 bits are set to zero. +static __inline __m512d __DEFAULT_FN_ATTRS +_mm512_zextpd256_pd512(__m256d __a) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 384 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. +static __inline __m512 __DEFAULT_FN_ATTRS +_mm512_zextps128_ps512(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); +} + +/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a +/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain +/// the value of the source vector. The upper 256 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits +/// contain the value of the parameter. The upper 256 bits are set to zero. +static __inline __m512 __DEFAULT_FN_ATTRS +_mm512_zextps256_ps512(__m256 __a) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 384 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 512-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 384 bits are set to zero. +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_zextsi128_si512(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); +} + +/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector. +/// The lower 256 bits contain the value of the source vector. The upper +/// 256 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 512-bit integer vector. The lower 256 bits contain the value of +/// the parameter. The upper 256 bits are set to zero. +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_zextsi256_si512(__m256i __a) +{ + return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); +} + /* Bitwise operators */ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_and_epi32(__m512i __a, __m512i __b) @@ -4179,7 +4289,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) + (__v16si) _mm512_setzero_si512 (), (__mmask16) __U , _MM_FROUND_CUR_DIRECTION); @@ -4229,6 +4339,18 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } +static __inline__ double __DEFAULT_FN_ATTRS +_mm512_cvtsd_f64(__m512d __a) +{ + return __a[0]; +} + +static __inline__ float __DEFAULT_FN_ATTRS +_mm512_cvtss_f32(__m512 __a) +{ + return __a[0]; +} + /* Unpack and Interleave */ static __inline __m512d __DEFAULT_FN_ATTRS @@ -4540,7 +4662,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) } static __inline __m512d __DEFAULT_FN_ATTRS -_mm512_loadu_pd(double const *__p) +_mm512_loadu_pd(void const *__p) { struct __loadu_pd { __m512d __v; @@ -4549,7 +4671,7 @@ _mm512_loadu_pd(double const *__p) } static __inline __m512 __DEFAULT_FN_ATTRS -_mm512_loadu_ps(float const *__p) +_mm512_loadu_ps(void const *__p) { struct __loadu_ps { __m512 __v; @@ -4558,7 +4680,7 @@ _mm512_loadu_ps(float const *__p) } static __inline __m512 __DEFAULT_FN_ATTRS -_mm512_load_ps(float const *__p) +_mm512_load_ps(void const *__p) { return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p, (__v16sf) @@ -4584,7 +4706,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P) } static __inline __m512d __DEFAULT_FN_ATTRS -_mm512_load_pd(double const *__p) +_mm512_load_pd(void const *__p) { return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p, (__v8df) @@ -7278,107 +7400,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8)(U), (int)(R)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_broadcast_f32x4 (__m128 __A) +_mm512_broadcast_f32x4(__m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) __O, - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_broadcast_f64x4 (__m256d __A) +_mm512_broadcast_f64x4(__m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) __O, - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)__O); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_setzero_pd (), - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i32x4 (__m128i __A) +_mm512_broadcast_i32x4(__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) __O, - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i64x4 (__m256i __A) +_mm512_broadcast_i64x4(__m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) __O, - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512d __DEFAULT_FN_ATTRS @@ -7860,12 +7972,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) 3 + ((imm) & 0x3) * 4); }) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ - (__v4si)__W); }) + (__v4si)(W)); }) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ (__v4si)_mm_setzero_si128()); }) @@ -7878,12 +7990,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) ((imm) & 1) ? 7 : 3); }) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ - (__v4di)__W); }) + (__v4di)(W)); }) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ (__v4di)_mm256_setzero_si256()); }) @@ -8159,11 +8271,11 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) (__v8di)(__m512i)(index), (__mmask8)-1, \ (int)(scale)); }) -#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\ - __addr, __scale) __extension__({\ -__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\ - __addr,(__v8di) __index, __mask, __scale);\ -}) +#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\ + (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ + (float const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale)); }) #define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\ (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \ @@ -8858,6 +8970,8 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) (__mmask16) -1); } +#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 + static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) @@ -8868,6 +8982,8 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, __M); } +#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 + static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kand (__mmask16 __A, __mmask16 __B) { @@ -8919,25 +9035,29 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B) static __inline__ void __DEFAULT_FN_ATTRS _mm512_stream_si512 (__m512i * __P, __m512i __A) { - __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P); + typedef __v8di __v8di_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_stream_load_si512 (void *__P) { - return __builtin_ia32_movntdqa512 ((__v8di *)__P); + typedef __v8di __v8di_aligned __attribute__((aligned(64))); + return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); } static __inline__ void __DEFAULT_FN_ATTRS _mm512_stream_pd (double *__P, __m512d __A) { - __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P); + typedef __v8df __v8df_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); } static __inline__ void __DEFAULT_FN_ATTRS _mm512_stream_ps (float *__P, __m512 __A) { - __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P); + typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); } static __inline__ __m512d __DEFAULT_FN_ATTRS @@ -9101,39 +9221,39 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - __m128 res = __A; + __m128 res = __A; res[0] = (__U & 1) ? __B[0] : __W[0]; - return res; + return res; } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) { - __m128 res = __A; - res[0] = (__U & 1) ? __B[0] : 0; - return res; + __m128 res = __A; + res[0] = (__U & 1) ? __B[0] : 0; + return res; } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - __m128d res = __A; + __m128d res = __A; res[0] = (__U & 1) ? __B[0] : __W[0]; - return res; + return res; } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) { - __m128d res = __A; - res[0] = (__U & 1) ? __B[0] : 0; - return res; + __m128d res = __A; + res[0] = (__U & 1) ? __B[0] : 0; + return res; } static __inline__ void __DEFAULT_FN_ATTRS _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) { - __builtin_ia32_storess128_mask ((__v16sf *)__W, + __builtin_ia32_storess128_mask ((__v16sf *)__W, (__v16sf) _mm512_castps128_ps512(__A), (__mmask16) __U & (__mmask16)1); } @@ -9141,7 +9261,7 @@ _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) static __inline__ void __DEFAULT_FN_ATTRS _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) { - __builtin_ia32_storesd128_mask ((__v8df *)__W, + __builtin_ia32_storesd128_mask ((__v8df *)__W, (__v8df) _mm512_castpd128_pd512(__A), (__mmask8) __U & 1); } @@ -9490,7 +9610,7 @@ _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) { return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A), (__v2df)(__B), - (__v4sf)(__W), + (__v4sf)(__W), (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); } @@ -9499,7 +9619,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) { return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A), (__v2df)(__B), - (__v4sf)_mm_setzero_ps(), + (__v4sf)_mm_setzero_ps(), (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); } @@ -9564,7 +9684,7 @@ _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A), (__v4sf)(__B), (__v2df)(__W), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); } static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -9572,8 +9692,8 @@ _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) { return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A), (__v4sf)(__B), - (__v2df)_mm_setzero_pd(), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + (__v2df)_mm_setzero_pd(), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); } static __inline__ __m128d __DEFAULT_FN_ATTRS @@ -9635,6 +9755,45 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) } #endif +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, + char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, + char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, + char __e46, char __e45, char __e44, char __e43, char __e42, char __e41, + char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, + char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, + char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, + char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, + char __e16, char __e15, char __e14, char __e13, char __e12, char __e11, + char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, + char __e4, char __e3, char __e2, char __e1, char __e0) { + + return __extension__ (__m512i)(__v64qi) + {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, + __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, + __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, + __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31, + __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39, + __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47, + __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55, + __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; +} + +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, + short __e27, short __e26, short __e25, short __e24, short __e23, + short __e22, short __e21, short __e20, short __e19, short __e18, + short __e17, short __e16, short __e15, short __e14, short __e13, + short __e12, short __e11, short __e10, short __e9, short __e8, + short __e7, short __e6, short __e5, short __e4, short __e3, + short __e2, short __e1, short __e0) { + return __extension__ (__m512i)(__v32hi) + {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, + __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, + __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, + __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; +} + static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set_epi32 (int __A, int __B, int __C, int __D, int __E, int __F, int __G, int __H, @@ -9780,7 +9939,7 @@ static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { } // Vec512 - Vector with size 512. -// Vec512Neutral - All vector elements set to the identity element. +// Vec512Neutral - All vector elements set to the identity element. // Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0} // Operator - Can be one of following: +,*,&,| // Mask - Intrinsic Mask @@ -9810,19 +9969,19 @@ _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), &, __M, i, i, q); } static __inline__ long long __DEFAULT_FN_ATTRS _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, i, i, q); } static __inline__ double __DEFAULT_FN_ATTRS _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, + _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, f, d, pd); } @@ -9884,17 +10043,17 @@ _mm512_reduce_add_epi32(__m512i __W) { _mm512_reduce_operator_32bit(__W, +, i, i); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi32(__m512i __W) { _mm512_reduce_operator_32bit(__W, *, i, i); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_and_epi32(__m512i __W) { _mm512_reduce_operator_32bit(__W, &, i, i); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_or_epi32(__m512i __W) { _mm512_reduce_operator_32bit(__W, |, i, i); } @@ -9910,7 +10069,7 @@ _mm512_reduce_mul_ps(__m512 __W) { } // Vec512 - Vector with size 512. -// Vec512Neutral - All vector elements set to the identity element. +// Vec512Neutral - All vector elements set to the identity element. // Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0} // Operator - Can be one of following: +,*,&,| // Mask - Intrinsic Mask @@ -9940,7 +10099,7 @@ _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { static __inline__ int __DEFAULT_FN_ATTRS _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, + _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, i, i, d); } @@ -10003,7 +10162,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { return Vec512[0]; \ }) -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_max_epi64(__m512i __V) { _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i); } @@ -10013,7 +10172,7 @@ _mm512_reduce_max_epu64(__m512i __V) { _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i); } -static __inline__ double __DEFAULT_FN_ATTRS +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_max_pd(__m512d __V) { _mm512_reduce_maxMin_64bit(__V, max_pd, d, f); } @@ -10028,7 +10187,7 @@ _mm512_reduce_min_epu64(__m512i __V) { _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i); } -static __inline__ double __DEFAULT_FN_ATTRS +static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_min_pd(__m512d __V) { _mm512_reduce_maxMin_64bit(__V, min_pd, d, f); } diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h index cd9da43..aecd7df 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h @@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_broadcast_f64x2 (__m128d __A) +_mm256_broadcast_f64x2(__m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df)_mm256_undefined_pd(), - (__mmask8) -1); + return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A) +_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df) __O, - __M); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)__O); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df) _mm256_setzero_ps (), - __M); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_broadcast_i64x2 (__m128i __A) +_mm256_broadcast_i64x2(__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di)_mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A) +_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di) __O, - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)_mm256_setzero_si256()); } #define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \ diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h index f3744da..99bb050 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h @@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_broadcast_f32x4 (__m128 __A) +_mm256_broadcast_f32x4(__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_pd (), - (__mmask8) -1); + return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) +_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) _mm256_setzero_ps (), - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_broadcast_i32x4 (__m128i __A) +_mm256_broadcast_i32x4(__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A, - (__v8si)_mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A) +_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A, - (__v8si) - __O, __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) +_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) - __A, - (__v8si) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256d __DEFAULT_FN_ATTRS diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vpopcntdqintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vpopcntdqintrin.h new file mode 100644 index 0000000..34ab849 --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vpopcntdqintrin.h @@ -0,0 +1,70 @@ +/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics + *------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VPOPCNTDQINTRIN_H +#define __AVX512VPOPCNTDQINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd" \ + "q"))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { + return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { + return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h index be03ba3..dff5897 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h @@ -1458,12 +1458,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Computes two dot products in parallel, using the lower and upper /// halves of two [8 x float] vectors as input to the two computations, and /// returning the two dot products in the lower and upper halves of the -/// [8 x float] result. The immediate integer operand controls which input -/// elements will contribute to the dot product, and where the final results -/// are returned. In general, for each dot product, the four corresponding -/// elements of the input vectors are multiplied; the first two and second -/// two products are summed, then the two sums are added to form the final -/// result. +/// [8 x float] result. +/// +/// The immediate integer operand controls which input elements will +/// contribute to the dot product, and where the final results are returned. +/// In general, for each dot product, the four corresponding elements of the +/// input vectors are multiplied; the first two and second two products are +/// summed, then the two sums are added to form the final result. /// /// \headerfile <x86intrin.h> /// @@ -1497,15 +1498,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /* Vector shuffle */ /// \brief Selects 8 float values from the 256-bit operands of [8 x float], as -/// specified by the immediate value operand. The four selected elements in -/// each operand are copied to the destination according to the bits -/// specified in the immediate operand. The selected elements from the first -/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the -/// destination, and the selected elements from the second 256-bit operand -/// are copied to bits [127:64] and bits [255:192] of the destination. For -/// example, if bits [7:0] of the immediate operand contain a value of 0xFF, -/// the 256-bit destination vector would contain the following values: b[7], -/// b[7], a[7], a[7], b[3], b[3], a[3], a[3]. +/// specified by the immediate value operand. +/// +/// The four selected elements in each operand are copied to the destination +/// according to the bits specified in the immediate operand. The selected +/// elements from the first 256-bit operand are copied to bits [63:0] and +/// bits [191:128] of the destination, and the selected elements from the +/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of +/// the destination. For example, if bits [7:0] of the immediate operand +/// contain a value of 0xFF, the 256-bit destination vector would contain the +/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. /// /// \headerfile <x86intrin.h> /// @@ -1557,13 +1559,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 12 + (((mask) >> 6) & 0x3)); }) /// \brief Selects four double-precision values from the 256-bit operands of -/// [4 x double], as specified by the immediate value operand. The selected -/// elements from the first 256-bit operand are copied to bits [63:0] and -/// bits [191:128] in the destination, and the selected elements from the -/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in -/// the destination. For example, if bits [3:0] of the immediate operand -/// contain a value of 0xF, the 256-bit destination vector would contain the -/// following values: b[3], a[3], b[1], a[1]. +/// [4 x double], as specified by the immediate value operand. +/// +/// The selected elements from the first 256-bit operand are copied to bits +/// [63:0] and bits [191:128] in the destination, and the selected elements +/// from the second 256-bit operand are copied to bits [127:64] and bits +/// [255:192] in the destination. For example, if bits [3:0] of the immediate +/// operand contain a value of 0xF, the 256-bit destination vector would +/// contain the following values: b[3], a[3], b[1], a[1]. /// /// \headerfile <x86intrin.h> /// @@ -1613,9 +1616,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ -#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */ #define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ -#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */ #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ @@ -1628,10 +1631,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) #define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ #define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ #define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ -#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */ #define _CMP_ORD_S 0x17 /* Ordered (signaling) */ #define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ -#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */ #define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ #define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ @@ -1641,9 +1644,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding double-precision values of two /// 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. Returns a [2 x double] vector consisting of -/// two doubles corresponding to the two comparison results: zero if the -/// comparison is false, and all 1's if the comparison is true. +/// immediate integer operand. +/// +/// Returns a [2 x double] vector consisting of two doubles corresponding to +/// the two comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. /// /// \headerfile <x86intrin.h> /// @@ -1660,17 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_pd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ @@ -1678,9 +1704,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding values of two 128-bit vectors of /// [4 x float], using the operation specified by the immediate integer -/// operand. Returns a [4 x float] vector consisting of four floats -/// corresponding to the four comparison results: zero if the comparison is -/// false, and all 1's if the comparison is true. +/// operand. +/// +/// Returns a [4 x float] vector consisting of four floats corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. /// /// \headerfile <x86intrin.h> /// @@ -1697,17 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ps(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ @@ -1715,9 +1764,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding double-precision values of two /// 256-bit vectors of [4 x double], using the operation specified by the -/// immediate integer operand. Returns a [4 x double] vector consisting of -/// four doubles corresponding to the four comparison results: zero if the -/// comparison is false, and all 1's if the comparison is true. +/// immediate integer operand. +/// +/// Returns a [4 x double] vector consisting of four doubles corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. /// /// \headerfile <x86intrin.h> /// @@ -1734,17 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 256-bit vector of [4 x double] containing the comparison results. #define _mm256_cmp_pd(a, b, c) __extension__ ({ \ (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ @@ -1752,9 +1824,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding values of two 256-bit vectors of /// [8 x float], using the operation specified by the immediate integer -/// operand. Returns a [8 x float] vector consisting of eight floats -/// corresponding to the eight comparison results: zero if the comparison is -/// false, and all 1's if the comparison is true. +/// operand. +/// +/// Returns a [8 x float] vector consisting of eight floats corresponding to +/// the eight comparison results: zero if the comparison is false, and all +/// 1's if the comparison is true. /// /// \headerfile <x86intrin.h> /// @@ -1771,17 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 256-bit vector of [8 x float] containing the comparison results. #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ @@ -1789,8 +1884,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding scalar double-precision values of /// two 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. If the result is true, all 64 bits of the -/// destination vector are set; otherwise they are cleared. +/// immediate integer operand. +/// +/// If the result is true, all 64 bits of the destination vector are set; +/// otherwise they are cleared. /// /// \headerfile <x86intrin.h> /// @@ -1807,17 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_sd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ @@ -1825,8 +1943,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding scalar values of two 128-bit /// vectors of [4 x float], using the operation specified by the immediate -/// integer operand. If the result is true, all 32 bits of the destination -/// vector are set; otherwise they are cleared. +/// integer operand. +/// +/// If the result is true, all 32 bits of the destination vector are set; +/// otherwise they are cleared. /// /// \headerfile <x86intrin.h> /// @@ -1843,17 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ss(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ @@ -2184,12 +2325,32 @@ _mm256_cvttps_epi32(__m256 __a) return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); } +/// \brief Returns the first element of the input vector of [4 x double]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 64 bit double containing the first element of the input vector. static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a) { return __a[0]; } +/// \brief Returns the first element of the input vector of [8 x i32]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \returns A 32 bit integer containing the first element of the input vector. static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a) { @@ -2197,6 +2358,16 @@ _mm256_cvtsi256_si32(__m256i __a) return __b[0]; } +/// \brief Returns the first element of the input vector of [8 x float]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 32 bit float containing the first element of the input vector. static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a) { @@ -2380,7 +2551,9 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2407,7 +2580,9 @@ _mm_testz_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2434,7 +2609,9 @@ _mm_testc_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2462,7 +2639,9 @@ _mm_testnzc_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2489,7 +2668,9 @@ _mm_testz_ps(__m128 __a, __m128 __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2516,7 +2697,9 @@ _mm_testc_ps(__m128 __a, __m128 __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2544,7 +2727,9 @@ _mm_testnzc_ps(__m128 __a, __m128 __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2571,7 +2756,9 @@ _mm256_testz_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2598,7 +2785,9 @@ _mm256_testc_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2626,7 +2815,9 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2653,7 +2844,9 @@ _mm256_testz_ps(__m256 __a, __m256 __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2680,7 +2873,9 @@ _mm256_testc_ps(__m256 __a, __m256 __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2706,7 +2901,9 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. \n /// If there is at least one pair of bits where the bit from the first source @@ -2730,7 +2927,9 @@ _mm256_testz_si256(__m256i __a, __m256i __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. \n /// If there is at least one pair of bits where the bit from the first source @@ -2754,7 +2953,9 @@ _mm256_testc_si256(__m256i __a, __m256i __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. \n /// If there is at least one pair of bits where the bit from the first source @@ -3389,7 +3590,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b) { - __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); + typedef __v4di __v4di_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); } /// \brief Moves double-precision values from a 256-bit vector of [4 x double] @@ -3402,13 +3604,14 @@ _mm256_stream_si256(__m256i *__a, __m256i __b) /// /// \param __a /// A pointer to a 32-byte aligned memory location that will receive the -/// integer values. +/// double-precision floating-point values. /// \param __b /// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b) { - __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); + typedef __v4df __v4df_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); } /// \brief Moves single-precision floating point values from a 256-bit vector @@ -3428,7 +3631,8 @@ _mm256_stream_pd(double *__a, __m256d __b) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a) { - __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); + typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); } /* Create vectors */ @@ -4310,9 +4514,10 @@ _mm256_castsi256_si128(__m256i __a) } /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a -/// 128-bit floating-point vector of [2 x double]. The lower 128 bits -/// contain the value of the source vector. The contents of the upper 128 -/// bits are undefined. +/// 128-bit floating-point vector of [2 x double]. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. /// /// \headerfile <x86intrin.h> /// @@ -4330,9 +4535,10 @@ _mm256_castpd128_pd256(__m128d __a) } /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a -/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain -/// the value of the source vector. The contents of the upper 128 bits are -/// undefined. +/// 128-bit floating-point vector of [4 x float]. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. /// /// \headerfile <x86intrin.h> /// @@ -4350,6 +4556,7 @@ _mm256_castps128_ps256(__m128 __a) } /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// /// The lower 128 bits contain the value of the source vector. The contents /// of the upper 128 bits are undefined. /// @@ -4367,6 +4574,61 @@ _mm256_castsi128_si256(__m128i __a) return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_zextpd128_pd256(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); +} + +/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_zextps128_ps256(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 128 bits are set to zero. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_zextsi128_si256(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); +} + /* Vector insert. We use macros rather than inlines because we only want to accept @@ -4375,8 +4637,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating /// a 256-bit vector of [8 x float] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [4 x float] in the second parameter. The immediate -/// integer parameter determines between the upper or the lower 128 bits. +/// 128-bit vector of [4 x float] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. /// /// \headerfile <x86intrin.h> /// @@ -4420,8 +4684,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating /// a 256-bit vector of [4 x double] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [2 x double] in the second parameter. The immediate -/// integer parameter determines between the upper or the lower 128 bits. +/// 128-bit vector of [2 x double] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. /// /// \headerfile <x86intrin.h> /// @@ -4461,8 +4727,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit integer vector by first duplicating a /// 256-bit integer vector given in the first parameter, and then replacing /// either the upper or the lower 128 bits with the contents of a 128-bit -/// integer vector in the second parameter. The immediate integer parameter -/// determines between the upper or the lower 128 bits. +/// integer vector in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. /// /// \headerfile <x86intrin.h> /// diff --git a/contrib/llvm/tools/clang/lib/Headers/bmiintrin.h b/contrib/llvm/tools/clang/lib/Headers/bmiintrin.h index 488eb2d..e812a16 100644 --- a/contrib/llvm/tools/clang/lib/Headers/bmiintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/bmiintrin.h @@ -28,107 +28,17 @@ #ifndef __BMIINTRIN_H #define __BMIINTRIN_H -/// \brief Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned short _tzcnt_u16(unsigned short a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param a -/// An unsigned 16-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 16-bit integer containing the number of trailing zero -/// bits in the operand. #define _tzcnt_u16(a) (__tzcnt_u16((a))) -/// \brief Performs a bitwise AND of the second operand with the one's -/// complement of the first operand. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned int _andn_u32(unsigned int a, unsigned int b); -/// \endcode -/// -/// This intrinsic corresponds to the <c> ANDN </c> instruction. -/// -/// \param a -/// An unsigned integer containing one of the operands. -/// \param b -/// An unsigned integer containing one of the operands. -/// \returns An unsigned integer containing the bitwise AND of the second -/// operand with the one's complement of the first operand. #define _andn_u32(a, b) (__andn_u32((a), (b))) /* _bextr_u32 != __bextr_u32 */ -/// \brief Clears all bits in the source except for the least significant bit -/// containing a value of 1 and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned int _blsi_u32(unsigned int a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSI </c> instruction. -/// -/// \param a -/// An unsigned integer whose bits are to be cleared. -/// \returns An unsigned integer containing the result of clearing the bits from -/// the source operand. #define _blsi_u32(a) (__blsi_u32((a))) -/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least siginificant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned int _blsmsk_u32(unsigned int a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. -/// -/// \param a -/// An unsigned integer used to create the mask. -/// \returns An unsigned integer containing the newly created mask. #define _blsmsk_u32(a) (__blsmsk_u32((a))) -/// \brief Clears the least siginificant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned int _blsr_u32(unsigned int a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSR </c> instruction. -/// -/// \param a -/// An unsigned integer containing the operand to be cleared. -/// \returns An unsigned integer containing the result of clearing the source -/// operand. #define _blsr_u32(a) (__blsr_u32((a))) -/// \brief Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned int _tzcnt_u32(unsigned int a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param a -/// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 32-bit integer containing the number of trailing zero -/// bits in the operand. #define _tzcnt_u32(a) (__tzcnt_u32((a))) /* Define the default attributes for the functions in this file. */ @@ -238,7 +148,7 @@ __blsi_u32(unsigned int __X) } /// \brief Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least siginificant bit that is set to 1 in the source +/// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> @@ -254,7 +164,7 @@ __blsmsk_u32(unsigned int __X) return __X ^ (__X - 1); } -/// \brief Clears the least siginificant bit that is set to 1 in the source +/// \brief Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> @@ -305,91 +215,15 @@ _mm_tzcnt_32(unsigned int __X) #ifdef __x86_64__ -/// \brief Performs a bitwise AND of the second operand with the one's -/// complement of the first operand. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b); -/// \endcode -/// -/// This intrinsic corresponds to the <c> ANDN </c> instruction. -/// -/// \param a -/// An unsigned 64-bit integer containing one of the operands. -/// \param b -/// An unsigned 64-bit integer containing one of the operands. -/// \returns An unsigned 64-bit integer containing the bitwise AND of the second -/// operand with the one's complement of the first operand. #define _andn_u64(a, b) (__andn_u64((a), (b))) /* _bextr_u64 != __bextr_u64 */ -/// \brief Clears all bits in the source except for the least significant bit -/// containing a value of 1 and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned long long _blsi_u64(unsigned long long a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSI </c> instruction. -/// -/// \param a -/// An unsigned 64-bit integer whose bits are to be cleared. -/// \returns An unsigned 64-bit integer containing the result of clearing the -/// bits from the source operand. #define _blsi_u64(a) (__blsi_u64((a))) -/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least siginificant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned long long _blsmsk_u64(unsigned long long a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSMSK </c> instruction. -/// -/// \param a -/// An unsigned 64-bit integer used to create the mask. -/// \returns A unsigned 64-bit integer containing the newly created mask. #define _blsmsk_u64(a) (__blsmsk_u64((a))) -/// \brief Clears the least siginificant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned long long _blsr_u64(unsigned long long a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> BLSR </c> instruction. -/// -/// \param a -/// An unsigned 64-bit integer containing the operand to be cleared. -/// \returns An unsigned 64-bit integer containing the result of clearing the -/// source operand. #define _blsr_u64(a) (__blsr_u64((a))) -/// \brief Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// unsigned long long _tzcnt_u64(unsigned long long a); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param a -/// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 64-bit integer containing the number of trailing zero -/// bits in the operand. #define _tzcnt_u64(a) (__tzcnt_u64((a))) /// \brief Performs a bitwise AND of the second operand with the one's @@ -475,7 +309,7 @@ __blsi_u64(unsigned long long __X) } /// \brief Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least siginificant bit that is set to 1 in the source +/// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> @@ -484,14 +318,14 @@ __blsi_u64(unsigned long long __X) /// /// \param __X /// An unsigned 64-bit integer used to create the mask. -/// \returns A unsigned 64-bit integer containing the newly created mask. +/// \returns An unsigned 64-bit integer containing the newly created mask. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsmsk_u64(unsigned long long __X) { return __X ^ (__X - 1); } -/// \brief Clears the least siginificant bit that is set to 1 in the source +/// \brief Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> diff --git a/contrib/llvm/tools/clang/lib/Headers/clzerointrin.h b/contrib/llvm/tools/clang/lib/Headers/clzerointrin.h new file mode 100644 index 0000000..ed7478f --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/clzerointrin.h @@ -0,0 +1,50 @@ +/*===----------------------- clzerointrin.h - CLZERO ----------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __X86INTRIN_H +#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _CLZEROINTRIN_H +#define _CLZEROINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("clzero"))) + +/// \brief Loads the cache line address and zero's out the cacheline +/// +/// \headerfile <clzerointrin.h> +/// +/// This intrinsic corresponds to the <c> CLZERO </c> instruction. +/// +/// \param __line +/// A pointer to a cacheline which needs to be zeroed out. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clzero (void * __line) +{ + __builtin_ia32_clzero ((void *)__line); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* _CLZEROINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/cpuid.h b/contrib/llvm/tools/clang/lib/Headers/cpuid.h index 400dcfa..2dd0add 100644 --- a/contrib/llvm/tools/clang/lib/Headers/cpuid.h +++ b/contrib/llvm/tools/clang/lib/Headers/cpuid.h @@ -79,7 +79,7 @@ #define signature_VORTEX_edx 0x36387865 #define signature_VORTEX_ecx 0x436f5320 -/* Features in %ecx for level 1 */ +/* Features in %ecx for leaf 1 */ #define bit_SSE3 0x00000001 #define bit_PCLMULQDQ 0x00000002 #define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */ @@ -114,7 +114,7 @@ #define bit_F16C 0x20000000 #define bit_RDRND 0x40000000 -/* Features in %edx for level 1 */ +/* Features in %edx for leaf 1 */ #define bit_FPU 0x00000001 #define bit_VME 0x00000002 #define bit_DE 0x00000004 @@ -147,44 +147,95 @@ #define bit_TM 0x20000000 #define bit_PBE 0x80000000 -/* Features in %ebx for level 7 sub-leaf 0 */ +/* Features in %ebx for leaf 7 sub-leaf 0 */ #define bit_FSGSBASE 0x00000001 +#define bit_SGX 0x00000004 +#define bit_BMI 0x00000008 +#define bit_HLE 0x00000010 +#define bit_AVX2 0x00000020 #define bit_SMEP 0x00000080 +#define bit_BMI2 0x00000100 #define bit_ENH_MOVSB 0x00000200 +#define bit_RTM 0x00000800 +#define bit_MPX 0x00004000 +#define bit_AVX512F 0x00010000 +#define bit_AVX512DQ 0x00020000 +#define bit_RDSEED 0x00040000 +#define bit_ADX 0x00080000 +#define bit_AVX512IFMA 0x00200000 +#define bit_CLFLUSHOPT 0x00800000 +#define bit_CLWB 0x01000000 +#define bit_AVX512PF 0x04000000 +#define bit_AVX51SER 0x08000000 +#define bit_AVX512CD 0x10000000 +#define bit_SHA 0x20000000 +#define bit_AVX512BW 0x40000000 +#define bit_AVX512VL 0x80000000 + +/* Features in %ecx for leaf 7 sub-leaf 0 */ +#define bit_PREFTCHWT1 0x00000001 +#define bit_AVX512VBMI 0x00000002 +#define bit_PKU 0x00000004 +#define bit_OSPKE 0x00000010 +#define bit_AVX512VPOPCNTDQ 0x00004000 +#define bit_RDPID 0x00400000 + +/* Features in %edx for leaf 7 sub-leaf 0 */ +#define bit_AVX5124VNNIW 0x00000004 +#define bit_AVX5124FMAPS 0x00000008 + +/* Features in %eax for leaf 13 sub-leaf 1 */ +#define bit_XSAVEOPT 0x00000001 +#define bit_XSAVEC 0x00000002 +#define bit_XSAVES 0x00000008 + +/* Features in %ecx for leaf 0x80000001 */ +#define bit_LAHF_LM 0x00000001 +#define bit_ABM 0x00000020 +#define bit_SSE4a 0x00000040 +#define bit_PRFCHW 0x00000100 +#define bit_XOP 0x00000800 +#define bit_LWP 0x00008000 +#define bit_FMA4 0x00010000 +#define bit_TBM 0x00200000 +#define bit_MWAITX 0x20000000 + +/* Features in %edx for leaf 0x80000001 */ +#define bit_MMXEXT 0x00400000 +#define bit_LM 0x20000000 +#define bit_3DNOWP 0x40000000 +#define bit_3DNOW 0x80000000 + +/* Features in %ebx for leaf 0x80000001 */ +#define bit_CLZERO 0x00000001 + #if __i386__ -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \ __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level)) + : "0"(__leaf)) -#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \ +#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \ __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level), "2"(__count)) + : "0"(__leaf), "2"(__count)) #else /* x86-64 uses %rbx as the base register, so preserve it. */ -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \ __asm(" xchgq %%rbx,%q1\n" \ " cpuid\n" \ " xchgq %%rbx,%q1" \ : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level)) + : "0"(__leaf)) -#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \ +#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \ __asm(" xchgq %%rbx,%q1\n" \ " cpuid\n" \ " xchgq %%rbx,%q1" \ : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level), "2"(__count)) + : "0"(__leaf), "2"(__count)) #endif -static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, - unsigned int *__ebx, unsigned int *__ecx, - unsigned int *__edx) { - __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx); - return 1; -} - -static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig) +static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig) { unsigned int __eax, __ebx, __ecx, __edx; #if __i386__ @@ -208,8 +259,35 @@ static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig) return 0; #endif - __cpuid(__level, __eax, __ebx, __ecx, __edx); + __cpuid(__leaf, __eax, __ebx, __ecx, __edx); if (__sig) *__sig = __ebx; return __eax; } + +static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) +{ + unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0); + + if (__max_leaf == 0 || __max_leaf < __leaf) + return 0; + + __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +static __inline int __get_cpuid_count (unsigned int __leaf, + unsigned int __subleaf, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0); + + if (__max_leaf == 0 || __max_leaf < __leaf) + return 0; + + __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} diff --git a/contrib/llvm/tools/clang/lib/Headers/emmintrin.h b/contrib/llvm/tools/clang/lib/Headers/emmintrin.h index 1512f9f..709815c 100644 --- a/contrib/llvm/tools/clang/lib/Headers/emmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/emmintrin.h @@ -302,7 +302,7 @@ _mm_min_pd(__m128d __a, __m128d __b) return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares lower 64-bits double-precision values of both operands, and +/// \brief Compares lower 64-bit double-precision values of both operands, and /// returns the greater of the pair of values in the lower 64-bits of the /// result. The upper 64 bits of the result are copied from the upper double- /// precision value of the first operand. @@ -462,8 +462,9 @@ _mm_cmplt_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are less than or equal to those in the second operand. Each -/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are less than or equal to those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -482,8 +483,9 @@ _mm_cmple_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are greater than those in the second operand. Each comparison -/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are greater than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -502,8 +504,9 @@ _mm_cmpgt_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are greater than or equal to those in the second operand. Each -/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are greater than or equal to those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -522,9 +525,10 @@ _mm_cmpge_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are ordered with respect to those in the second operand. A pair -/// of double-precision values are "ordered" with respect to each other if -/// neither value is a NaN. Each comparison yields 0h for false, +/// operand are ordered with respect to those in the second operand. +/// +/// A pair of double-precision values are "ordered" with respect to each +/// other if neither value is a NaN. Each comparison yields 0h for false, /// FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> @@ -544,9 +548,10 @@ _mm_cmpord_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are unordered with respect to those in the second operand. A pair -/// of double-precision values are "unordered" with respect to each other if -/// one or both values are NaN. Each comparison yields 0h for false, +/// operand are unordered with respect to those in the second operand. +/// +/// A pair of double-precision values are "unordered" with respect to each +/// other if one or both values are NaN. Each comparison yields 0h for false, /// FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> @@ -567,8 +572,9 @@ _mm_cmpunord_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are unequal to those in the second operand. Each comparison -/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are unequal to those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -587,8 +593,9 @@ _mm_cmpneq_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not less than those in the second operand. Each comparison -/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are not less than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -607,8 +614,9 @@ _mm_cmpnlt_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not less than or equal to those in the second operand. Each -/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are not less than or equal to those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -627,8 +635,9 @@ _mm_cmpnle_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not greater than those in the second operand. Each -/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// operand are not greater than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -648,6 +657,7 @@ _mm_cmpngt_pd(__m128d __a, __m128d __b) /// \brief Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are not greater than or equal to those in the second operand. +/// /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> @@ -666,8 +676,9 @@ _mm_cmpnge_pd(__m128d __a, __m128d __b) } /// \brief Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. The -/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// the two 128-bit floating-point vectors of [2 x double] for equality. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -690,8 +701,9 @@ _mm_cmpeq_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in -/// the second parameter. The comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -714,8 +726,9 @@ _mm_cmplt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. The comparison yields 0h for -/// false, FFFFFFFFFFFFFFFFh for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -738,8 +751,9 @@ _mm_cmple_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value -/// in the second parameter. The comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -763,8 +777,9 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. The comparison yields 0h for -/// false, FFFFFFFFFFFFFFFFh for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -788,9 +803,11 @@ _mm_cmpge_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is "ordered" with respect to the -/// corresponding value in the second parameter. The comparison yields 0h for -/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are -/// "ordered" with respect to each other if neither value is a NaN. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of +/// double-precision values are "ordered" with respect to each other if +/// neither value is a NaN. /// /// \headerfile <x86intrin.h> /// @@ -813,9 +830,11 @@ _mm_cmpord_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is "unordered" with respect to the -/// corresponding value in the second parameter. The comparison yields 0h -/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values -/// are "unordered" with respect to each other if one or both values are NaN. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of +/// double-precision values are "unordered" with respect to each other if one +/// or both values are NaN. /// /// \headerfile <x86intrin.h> /// @@ -839,8 +858,9 @@ _mm_cmpunord_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. The comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -863,8 +883,9 @@ _mm_cmpneq_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not less than the corresponding -/// value in the second parameter. The comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -887,8 +908,9 @@ _mm_cmpnlt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not less than or equal to the -/// corresponding value in the second parameter. The comparison yields 0h -/// for false, FFFFFFFFFFFFFFFFh for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -911,8 +933,9 @@ _mm_cmpnle_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not greater than the corresponding -/// value in the second parameter. The comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -936,8 +959,9 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not greater than or equal to the -/// corresponding value in the second parameter. The comparison yields 0h -/// for false, FFFFFFFFFFFFFFFFh for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -982,7 +1006,9 @@ _mm_comieq_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in -/// the second parameter. The comparison yields 0 for false, 1 for true. +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. /// /// \headerfile <x86intrin.h> /// @@ -1004,8 +1030,9 @@ _mm_comilt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. The comparison yields 0 for -/// false, 1 for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. /// /// \headerfile <x86intrin.h> /// @@ -1027,7 +1054,9 @@ _mm_comile_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value -/// in the second parameter. The comparison yields 0 for false, 1 for true. +/// in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. /// /// \headerfile <x86intrin.h> /// @@ -1049,8 +1078,9 @@ _mm_comigt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. The comparison yields 0 for -/// false, 1 for true. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. /// /// \headerfile <x86intrin.h> /// @@ -1072,7 +1102,9 @@ _mm_comige_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. The comparison yields 0 for false, 1 for true. +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. /// /// \headerfile <x86intrin.h> /// @@ -1093,8 +1125,9 @@ _mm_comineq_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] for equality. The -/// comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. +/// comparison yields 0 for false, 1 for true. +/// +/// If either of the two lower double-precision values is NaN, 1 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1117,8 +1150,10 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in -/// the second parameter. The comparison yields 0 for false, 1 for true. If -/// either of the two lower double-precision values is NaN, 1 is returned. +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 1 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1141,9 +1176,10 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. The comparison yields 0 for -/// false, 1 for true. If either of the two lower double-precision values is -/// NaN, 1 is returned. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 1 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1166,8 +1202,10 @@ _mm_ucomile_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value -/// in the second parameter. The comparison yields 0 for false, 1 for true. -/// If either of the two lower double-precision values is NaN, 0 is returned. +/// in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1190,9 +1228,10 @@ _mm_ucomigt_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. The comparison yields 0 for -/// false, 1 for true. If either of the two lower double-precision values -/// is NaN, 0 is returned. +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1215,8 +1254,10 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) /// \brief Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. The comparison yields 0 for false, 1 for true. If -/// either of the two lower double-precision values is NaN, 0 is returned. +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1278,8 +1319,9 @@ _mm_cvtps_pd(__m128 __a) /// \brief Converts the lower two integer elements of a 128-bit vector of /// [4 x i32] into two double-precision floating-point values, returned in a -/// 128-bit vector of [2 x double]. The upper two elements of the input -/// vector are unused. +/// 128-bit vector of [2 x double]. +/// +/// The upper two elements of the input vector are unused. /// /// \headerfile <x86intrin.h> /// @@ -1287,7 +1329,9 @@ _mm_cvtps_pd(__m128 __a) /// /// \param __a /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are -/// converted to double-precision values. The upper two elements are unused. +/// converted to double-precision values. +/// +/// The upper two elements are unused. /// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) @@ -1409,10 +1453,11 @@ _mm_cvtss_sd(__m128d __a, __m128 __b) /// \brief Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the -/// result of either conversion is inexact, the result is truncated (rounded -/// towards zero) regardless of the current MXCSR setting. The upper 64 bits -/// of the result vector are set to zero. +/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. +/// +/// If the result of either conversion is inexact, the result is truncated +/// (rounded towards zero) regardless of the current MXCSR setting. The upper +/// 64 bits of the result vector are set to zero. /// /// \headerfile <x86intrin.h> /// @@ -1466,9 +1511,10 @@ _mm_cvtpd_pi32(__m128d __a) /// \brief Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in a 64-bit vector of [2 x i32]. If the result of either -/// conversion is inexact, the result is truncated (rounded towards zero) -/// regardless of the current MXCSR setting. +/// returned in a 64-bit vector of [2 x i32]. +/// +/// If the result of either conversion is inexact, the result is truncated +/// (rounded towards zero) regardless of the current MXCSR setting. /// /// \headerfile <x86intrin.h> /// @@ -1599,6 +1645,17 @@ _mm_loadu_pd(double const *__dp) return ((struct __loadu_pd*)__dp)->__v; } +/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. +/// +/// \param __a +/// A pointer to a 64-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [2 x i64] containing the loaded value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) { @@ -1609,6 +1666,17 @@ _mm_loadu_si64(void const *__a) return (__m128i){__u, 0L}; } +/// \brief Loads a 64-bit double-precision value to the low element of a +/// 128-bit integer vector and clears the upper element. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. +/// +/// \param __dp +/// A pointer to a memory location containing a double-precision value. +/// The address of the memory location does not have to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded value. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { @@ -1728,6 +1796,24 @@ _mm_set1_pd(double __w) return (__m128d){ __w, __w }; } +/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each +/// of the two double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_set_pd1(double __w) +{ + return _mm_set1_pd(__w); +} + /// \brief Constructs a 128-bit floating-point vector of [2 x double] /// initialized with the specified double-precision floating-point values. /// @@ -1787,7 +1873,7 @@ _mm_setzero_pd(void) /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. -// +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. @@ -1825,12 +1911,38 @@ _mm_store_sd(double *__dp, __m128d __a) ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; } +/// \brief Moves packed double-precision values from a 128-bit vector of +/// [2 x double] to a memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. +/// +/// \param __dp +/// A pointer to an aligned memory location that can store two +/// double-precision values. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the values to be +/// moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a) { *(__m128d*)__dp = __a; } +/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. +/// +/// \param __dp +/// A pointer to a memory location that can store two double-precision +/// values. +/// \param __a +/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each +/// of the values in \a dp. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a) { @@ -1940,8 +2052,9 @@ _mm_storel_pd(double *__dp, __m128d __a) /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], /// saving the lower 8 bits of each sum in the corresponding element of a -/// 128-bit result vector of [16 x i8]. The integer elements of both -/// parameters can be either signed or unsigned. +/// 128-bit result vector of [16 x i8]. +/// +/// The integer elements of both parameters can be either signed or unsigned. /// /// \headerfile <x86intrin.h> /// @@ -1961,8 +2074,9 @@ _mm_add_epi8(__m128i __a, __m128i __b) /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], /// saving the lower 16 bits of each sum in the corresponding element of a -/// 128-bit result vector of [8 x i16]. The integer elements of both -/// parameters can be either signed or unsigned. +/// 128-bit result vector of [8 x i16]. +/// +/// The integer elements of both parameters can be either signed or unsigned. /// /// \headerfile <x86intrin.h> /// @@ -1982,8 +2096,9 @@ _mm_add_epi16(__m128i __a, __m128i __b) /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], /// saving the lower 32 bits of each sum in the corresponding element of a -/// 128-bit result vector of [4 x i32]. The integer elements of both -/// parameters can be either signed or unsigned. +/// 128-bit result vector of [4 x i32]. +/// +/// The integer elements of both parameters can be either signed or unsigned. /// /// \headerfile <x86intrin.h> /// @@ -2021,8 +2136,9 @@ _mm_add_si64(__m64 __a, __m64 __b) /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], /// saving the lower 64 bits of each sum in the corresponding element of a -/// 128-bit result vector of [2 x i64]. The integer elements of both -/// parameters can be either signed or unsigned. +/// 128-bit result vector of [2 x i64]. +/// +/// The integer elements of both parameters can be either signed or unsigned. /// /// \headerfile <x86intrin.h> /// @@ -2168,10 +2284,12 @@ _mm_avg_epu16(__m128i __a, __m128i __b) /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] /// vectors, producing eight intermediate 32-bit signed integer products, and /// adds the consecutive pairs of 32-bit products to form a 128-bit signed -/// [4 x i32] vector. For example, bits [15:0] of both parameters are -/// multiplied producing a 32-bit product, bits [31:16] of both parameters -/// are multiplied producing a 32-bit product, and the sum of those two -/// products becomes bits [31:0] of the result. +/// [4 x i32] vector. +/// +/// For example, bits [15:0] of both parameters are multiplied producing a +/// 32-bit product, bits [31:16] of both parameters are multiplied producing +/// a 32-bit product, and the sum of those two products becomes bits [31:0] +/// of the result. /// /// \headerfile <x86intrin.h> /// @@ -2369,7 +2487,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b) /// \brief Computes the absolute differences of corresponding 8-bit integer /// values in two 128-bit vectors. Sums the first 8 absolute differences, and -/// separately sums the second 8 absolute differences. Packss these two +/// separately sums the second 8 absolute differences. Packs these two /// unsigned 16-bit integer sums into the upper and lower elements of a /// [2 x i64] vector. /// @@ -3106,8 +3224,9 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b) /// \brief Compares each of the corresponding signed 16-bit values of the /// 128-bit integer vectors to determine if the values in the first operand -/// are greater than those in the second operand. Each comparison yields 0h -/// for false, FFFFh for true. +/// are greater than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -3126,8 +3245,9 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b) /// \brief Compares each of the corresponding signed 32-bit values of the /// 128-bit integer vectors to determine if the values in the first operand -/// are greater than those in the second operand. Each comparison yields 0h -/// for false, FFFFFFFFh for true. +/// are greater than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -3146,8 +3266,9 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b) /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit /// integer vectors to determine if the values in the first operand are less -/// than those in the second operand. Each comparison yields 0h for false, -/// FFh for true. +/// than those in the second operand. +/// +/// Each comparison yields 0h for false, FFh for true. /// /// \headerfile <x86intrin.h> /// @@ -3166,8 +3287,9 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b) /// \brief Compares each of the corresponding signed 16-bit values of the /// 128-bit integer vectors to determine if the values in the first operand -/// are less than those in the second operand. Each comparison yields 0h for -/// false, FFFFh for true. +/// are less than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -3186,8 +3308,9 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b) /// \brief Compares each of the corresponding signed 32-bit values of the /// 128-bit integer vectors to determine if the values in the first operand -/// are less than those in the second operand. Each comparison yields 0h for -/// false, FFFFFFFFh for true. +/// are less than those in the second operand. +/// +/// Each comparison yields 0h for false, FFFFFFFFh for true. /// /// \headerfile <x86intrin.h> /// @@ -3885,10 +4008,11 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) /// \brief Moves bytes selected by the mask from the first operand to the /// specified unaligned memory location. When a mask bit is 1, the -/// corresponding byte is written, otherwise it is not written. To minimize -/// caching, the date is flagged as non-temporal (unlikely to be used again -/// soon). Exception and trap behavior for elements not selected for storage -/// to memory are implementation dependent. +/// corresponding byte is written, otherwise it is not written. +/// +/// To minimize caching, the date is flagged as non-temporal (unlikely to be +/// used again soon). Exception and trap behavior for elements not selected +/// for storage to memory are implementation dependent. /// /// \headerfile <x86intrin.h> /// @@ -3932,8 +4056,10 @@ _mm_storel_epi64(__m128i *__p, __m128i __a) } /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit -/// aligned memory location. To minimize caching, the data is flagged as -/// non-temporal (unlikely to be used again soon). +/// aligned memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). /// /// \headerfile <x86intrin.h> /// @@ -3950,6 +4076,7 @@ _mm_stream_pd(double *__p, __m128d __a) } /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. +/// /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). /// @@ -3967,8 +4094,9 @@ _mm_stream_si128(__m128i *__p, __m128i __a) __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); } -/// \brief Stores a 32-bit integer value in the specified memory location. To -/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// \brief Stores a 32-bit integer value in the specified memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). /// /// \headerfile <x86intrin.h> @@ -3986,8 +4114,9 @@ _mm_stream_si32(int *__p, int __a) } #ifdef __x86_64__ -/// \brief Stores a 64-bit integer value in the specified memory location. To -/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// \brief Stores a 64-bit integer value in the specified memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). /// /// \headerfile <x86intrin.h> @@ -4019,7 +4148,7 @@ extern "C" { /// \param __p /// A pointer to the memory location used to identify the cache line to be /// flushed. -void _mm_clflush(void const *); +void _mm_clflush(void const * __p); /// \brief Forces strong memory ordering (serialization) between load /// instructions preceding this instruction and load instructions following @@ -4141,7 +4270,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b) /// \param __a /// A 128-bit integer vector. /// \param __imm -/// An immediate value. Bits [3:0] selects values from \a __a to be assigned +/// An immediate value. Bits [2:0] selects values from \a __a to be assigned /// to bits[15:0] of the result. \n /// 000: assign values from bits [15:0] of \a __a. \n /// 001: assign values from bits [31:16] of \a __a. \n @@ -4788,4 +4917,12 @@ void _mm_pause(void); #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) +#define _MM_DENORMALS_ZERO_ON (0x0040) +#define _MM_DENORMALS_ZERO_OFF (0x0000) + +#define _MM_DENORMALS_ZERO_MASK (0x0040) + +#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) + #endif /* __EMMINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/f16cintrin.h b/contrib/llvm/tools/clang/lib/Headers/f16cintrin.h index 180712f..b796cc8 100644 --- a/contrib/llvm/tools/clang/lib/Headers/f16cintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/f16cintrin.h @@ -72,9 +72,9 @@ _cvtsh_ss(unsigned short __a) /// 011: Truncate \n /// 1XX: Use MXCSR.RC for rounding /// \returns The converted 16-bit half-precision float value. -#define _cvtss_sh(a, imm) \ - ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ - (imm)))[0])) +#define _cvtss_sh(a, imm) __extension__ ({ \ + (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ + (imm)))[0]); }) /// \brief Converts a 128-bit vector containing 32-bit float values into a /// 128-bit vector containing 16-bit half-precision float values. @@ -99,8 +99,8 @@ _cvtsh_ss(unsigned short __a) /// \returns A 128-bit vector containing converted 16-bit half-precision float /// values. The lower 64 bits are used to store the converted 16-bit /// half-precision floating-point values. -#define _mm_cvtps_ph(a, imm) \ - ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))) +#define _mm_cvtps_ph(a, imm) __extension__ ({ \ + (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); }) /// \brief Converts a 128-bit vector containing 16-bit half-precision float /// values into a 128-bit vector containing 32-bit float values. diff --git a/contrib/llvm/tools/clang/lib/Headers/float.h b/contrib/llvm/tools/clang/lib/Headers/float.h index 0f453d8..502143d 100644 --- a/contrib/llvm/tools/clang/lib/Headers/float.h +++ b/contrib/llvm/tools/clang/lib/Headers/float.h @@ -33,6 +33,15 @@ */ #if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \ __STDC_HOSTED__ && __has_include_next(<float.h>) + +/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level + * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this + * extra indirection. + */ +#ifdef __APPLE__ +#define _FLOAT_H_ +#endif + # include_next <float.h> /* Undefine anything that we'll be redefining below. */ diff --git a/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h b/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h index 16dc705..28f7d02 100644 --- a/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h @@ -35,14 +35,10 @@ extern "C" { #endif -#define _TEXASR_PTR(TM_BUF) \ - ((texasr_t *)((TM_BUF)+0)) -#define _TEXASRU_PTR(TM_BUF) \ - ((texasru_t *)((TM_BUF)+0)) -#define _TEXASRL_PTR(TM_BUF) \ - ((texasrl_t *)((TM_BUF)+4)) -#define _TFIAR_PTR(TM_BUF) \ - ((tfiar_t *)((TM_BUF)+8)) +#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0)) +#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0)) +#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4)) +#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8)) typedef char TM_buff_type[16]; @@ -178,7 +174,7 @@ extern __inline long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) __TM_is_conflict(void* const __TM_buff) { - texasru_t texasru = *_TEXASRU_PTR (TM_buff); + texasru_t texasru = *_TEXASRU_PTR (__TM_buff); /* Return TEXASR bits 11 (Self-Induced Conflict) through 14 (Translation Invalidation Conflict). */ return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0; diff --git a/contrib/llvm/tools/clang/lib/Headers/immintrin.h b/contrib/llvm/tools/clang/lib/Headers/immintrin.h index 7f91d49..c5f25bf 100644 --- a/contrib/llvm/tools/clang/lib/Headers/immintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/immintrin.h @@ -146,6 +146,10 @@ _mm256_cvtph_ps(__m128i __a) #include <avx512cdintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__) +#include <avx512vpopcntdqintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__) #include <avx512dqintrin.h> #endif @@ -208,6 +212,15 @@ _rdrand32_step(unsigned int *__p) return __builtin_ia32_rdrand32_step(__p); } +#ifdef __x86_64__ +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand64_step(unsigned long long *__p) +{ + return __builtin_ia32_rdrand64_step(__p); +} +#endif +#endif /* __RDRND__ */ + /* __bit_scan_forward */ static __inline__ int __attribute__((__always_inline__, __nodebug__)) _bit_scan_forward(int __A) { @@ -220,15 +233,6 @@ _bit_scan_reverse(int __A) { return 31 - __builtin_clz(__A); } -#ifdef __x86_64__ -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand64_step(unsigned long long *__p) -{ - return __builtin_ia32_rdrand64_step(__p); -} -#endif -#endif /* __RDRND__ */ - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__) #ifdef __x86_64__ static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) diff --git a/contrib/llvm/tools/clang/lib/Headers/intrin.h b/contrib/llvm/tools/clang/lib/Headers/intrin.h index a35262a..881d05c 100644 --- a/contrib/llvm/tools/clang/lib/Headers/intrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/intrin.h @@ -69,7 +69,6 @@ static __inline__ __int64 __emul(int, int); static __inline__ unsigned __int64 __emulu(unsigned int, unsigned int); -void __cdecl __fastfail(unsigned int); unsigned int __getcallerseflags(void); static __inline__ void __halt(void); @@ -80,16 +79,12 @@ void __incfsdword(unsigned long); void __incfsword(unsigned long); unsigned long __indword(unsigned short); void __indwordstring(unsigned short, unsigned long *, unsigned long); -void __int2c(void); void __invlpg(void *); unsigned short __inword(unsigned short); void __inwordstring(unsigned short, unsigned short *, unsigned long); void __lidt(void *); unsigned __int64 __ll_lshift(unsigned __int64, int); __int64 __ll_rshift(__int64, int); -void __llwpcb(void *); -unsigned char __lwpins32(unsigned int, unsigned int, unsigned int); -void __lwpval32(unsigned int, unsigned int, unsigned int); unsigned int __lzcnt(unsigned int); unsigned short __lzcnt16(unsigned short); static __inline__ @@ -128,7 +123,6 @@ unsigned __int64 __readmsr(unsigned long); unsigned __int64 __readpmc(unsigned long); unsigned long __segmentlimit(unsigned long); void __sidt(void *); -void *__slwpcb(void); static __inline__ void __stosb(unsigned char *, unsigned char, size_t); static __inline__ @@ -142,7 +136,6 @@ void __svm_stgi(void); void __svm_vmload(size_t); void __svm_vmrun(size_t); void __svm_vmsave(size_t); -void __ud2(void); unsigned __int64 __ull_rshift(unsigned __int64, int); void __vmx_off(void); void __vmx_vmptrst(unsigned __int64 *); @@ -176,7 +169,6 @@ void __cdecl _disable(void); void __cdecl _enable(void); long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value); unsigned char _interlockedbittestandreset(long volatile *, long); -static __inline__ unsigned char _interlockedbittestandset(long volatile *, long); long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long); long _InterlockedCompareExchange_HLERelease(long volatile *, long, long); @@ -231,8 +223,6 @@ void __incgsbyte(unsigned long); void __incgsdword(unsigned long); void __incgsqword(unsigned long); void __incgsword(unsigned long); -unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int); -void __lwpval64(unsigned __int64, unsigned int, unsigned int); unsigned __int64 __lzcnt64(unsigned __int64); static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); @@ -372,11 +362,6 @@ _bittestandset(long *_BitBase, long _BitPos) { *_BitBase = *_BitBase | (1 << _BitPos); return _Res; } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_interlockedbittestandset(long volatile *_BitBase, long _BitPos) { - long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST); - return (_PrevVal >> _BitPos) & 1; -} #if defined(__arm__) || defined(__aarch64__) static __inline__ unsigned char __DEFAULT_FN_ATTRS _interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) { @@ -872,48 +857,7 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, return _Comparand; } #endif -/*----------------------------------------------------------------------------*\ -|* readfs, readgs -|* (Pointers in address space #256 and #257 are relative to the GS and FS -|* segment registers, respectively.) -\*----------------------------------------------------------------------------*/ -#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset) \ - ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \ - (__offset)) -#ifdef __i386__ -static __inline__ unsigned char __DEFAULT_FN_ATTRS -__readfsbyte(unsigned long __offset) { - return *__ptr_to_addr_space(257, unsigned char, __offset); -} -static __inline__ unsigned short __DEFAULT_FN_ATTRS -__readfsword(unsigned long __offset) { - return *__ptr_to_addr_space(257, unsigned short, __offset); -} -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__readfsqword(unsigned long __offset) { - return *__ptr_to_addr_space(257, unsigned __int64, __offset); -} -#endif -#ifdef __x86_64__ -static __inline__ unsigned char __DEFAULT_FN_ATTRS -__readgsbyte(unsigned long __offset) { - return *__ptr_to_addr_space(256, unsigned char, __offset); -} -static __inline__ unsigned short __DEFAULT_FN_ATTRS -__readgsword(unsigned long __offset) { - return *__ptr_to_addr_space(256, unsigned short, __offset); -} -static __inline__ unsigned long __DEFAULT_FN_ATTRS -__readgsdword(unsigned long __offset) { - return *__ptr_to_addr_space(256, unsigned long, __offset); -} -static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS -__readgsqword(unsigned long __offset) { - return *__ptr_to_addr_space(256, unsigned __int64, __offset); -} -#endif -#undef __ptr_to_addr_space /*----------------------------------------------------------------------------*\ |* movs, stos \*----------------------------------------------------------------------------*/ diff --git a/contrib/llvm/tools/clang/lib/Headers/lwpintrin.h b/contrib/llvm/tools/clang/lib/Headers/lwpintrin.h new file mode 100644 index 0000000..c95fdd9 --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/lwpintrin.h @@ -0,0 +1,150 @@ +/*===---- lwpintrin.h - LWP intrinsics -------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __LWPINTRIN_H +#define __LWPINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp"))) + +/// \brief Parses the LWPCB at the specified address and enables +/// profiling if valid. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LLWPCB </c> instruction. +/// +/// \param __addr +/// Address to the new Lightweight Profiling Control Block (LWPCB). If the +/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables +/// Lightweight Profiling. +static __inline__ void __DEFAULT_FN_ATTRS +__llwpcb (void *__addr) +{ + __builtin_ia32_llwpcb(__addr); +} + +/// \brief Flushes the LWP state to memory and returns the address of the LWPCB. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> SLWPCB </c> instruction. +/// +/// \return +/// Address to the current Lightweight Profiling Control Block (LWPCB). +/// If LWP is not currently enabled, returns NULL. +static __inline__ void* __DEFAULT_FN_ATTRS +__slwpcb () +{ + return __builtin_ia32_slwpcb(); +} + +/// \brief Inserts programmed event record into the LWP event ring buffer +/// and advances the ring buffer pointer. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LWPINS </c> instruction. +/// +/// \param DATA2 +/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, +/// the event record overwrites the last record in the buffer, the MissedEvents +/// counter in the LWPCB is incremented, the head pointer is not advanced, and +/// 1 is returned. Otherwise 0 is returned. +#define __lwpins32(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +/// \brief Decrements the LWP programmed value sample event counter. If the result is +/// negative, inserts an event record into the LWP event ring buffer in memory +/// and advances the ring buffer pointer. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LWPVAL </c> instruction. +/// +/// \param DATA2 +/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +#define __lwpval32(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +#ifdef __x86_64__ + +/// \brief Inserts programmed event record into the LWP event ring buffer +/// and advances the ring buffer pointer. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LWPINS </c> instruction. +/// +/// \param DATA2 +/// A 64-bit value is inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, +/// the event record overwrites the last record in the buffer, the MissedEvents +/// counter in the LWPCB is incremented, the head pointer is not advanced, and +/// 1 is returned. Otherwise 0 is returned. +#define __lwpins64(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +/// \brief Decrements the LWP programmed value sample event counter. If the result is +/// negative, inserts an event record into the LWP event ring buffer in memory +/// and advances the ring buffer pointer. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> LWPVAL </c> instruction. +/// +/// \param DATA2 +/// A 64-bit value is and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +#define __lwpval64(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __LWPINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/mmintrin.h b/contrib/llvm/tools/clang/lib/Headers/mmintrin.h index e0c277a..4b38d51 100644 --- a/contrib/llvm/tools/clang/lib/Headers/mmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/mmintrin.h @@ -211,7 +211,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction. /// /// \param __m1 -/// A 64-bit integer vector of [8 x i8]. \n +/// A 64-bit integer vector of [8 x i8]. \n /// Bits [39:32] are written to bits [7:0] of the result. \n /// Bits [47:40] are written to bits [23:16] of the result. \n /// Bits [55:48] are written to bits [39:32] of the result. \n @@ -608,10 +608,11 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) /// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit /// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer -/// element of the first 64-bit integer vector of [8 x i8]. If an element of -/// the first vector is less than the corresponding element of the second -/// vector, the result is saturated to 0. The results are packed into a -/// 64-bit integer vector of [8 x i8]. +/// element of the first 64-bit integer vector of [8 x i8]. +/// +/// If an element of the first vector is less than the corresponding element +/// of the second vector, the result is saturated to 0. The results are +/// packed into a 64-bit integer vector of [8 x i8]. /// /// \headerfile <x86intrin.h> /// @@ -631,10 +632,11 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) /// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit /// integer vector of [4 x i16] from the corresponding 16-bit unsigned -/// integer element of the first 64-bit integer vector of [4 x i16]. If an -/// element of the first vector is less than the corresponding element of the -/// second vector, the result is saturated to 0. The results are packed into -/// a 64-bit integer vector of [4 x i16]. +/// integer element of the first 64-bit integer vector of [4 x i16]. +/// +/// If an element of the first vector is less than the corresponding element +/// of the second vector, the result is saturated to 0. The results are +/// packed into a 64-bit integer vector of [4 x i16]. /// /// \headerfile <x86intrin.h> /// @@ -657,9 +659,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) /// element of the second 64-bit integer vector of [4 x i16] and get four /// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums. /// The lower 32 bits of these two sums are packed into a 64-bit integer -/// vector of [2 x i32]. For example, bits [15:0] of both parameters are -/// multiplied, bits [31:16] of both parameters are multiplied, and the sum -/// of both results is written to bits [31:0] of the result. +/// vector of [2 x i32]. +/// +/// For example, bits [15:0] of both parameters are multiplied, bits [31:16] +/// of both parameters are multiplied, and the sum of both results is written +/// to bits [31:0] of the result. /// /// \headerfile <x86intrin.h> /// @@ -851,10 +855,11 @@ _mm_slli_si64(__m64 __m, int __count) /// \brief Right-shifts each 16-bit integer element of the first parameter, /// which is a 64-bit integer vector of [4 x i16], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. High-order -/// bits are filled with the sign bit of the initial value of each 16-bit -/// element. The 16-bit results are packed into a 64-bit integer vector of -/// [4 x i16]. +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 16-bit element. The 16-bit results are packed into a 64-bit integer +/// vector of [4 x i16]. /// /// \headerfile <x86intrin.h> /// @@ -874,6 +879,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector /// of [4 x i16] by the number of bits specified by a 32-bit integer. +/// /// High-order bits are filled with the sign bit of the initial value of each /// 16-bit element. The 16-bit results are packed into a 64-bit integer /// vector of [4 x i16]. @@ -896,10 +902,11 @@ _mm_srai_pi16(__m64 __m, int __count) /// \brief Right-shifts each 32-bit integer element of the first parameter, /// which is a 64-bit integer vector of [2 x i32], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. High-order -/// bits are filled with the sign bit of the initial value of each 32-bit -/// element. The 32-bit results are packed into a 64-bit integer vector of -/// [2 x i32]. +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 32-bit element. The 32-bit results are packed into a 64-bit integer +/// vector of [2 x i32]. /// /// \headerfile <x86intrin.h> /// @@ -919,6 +926,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector /// of [2 x i32] by the number of bits specified by a 32-bit integer. +/// /// High-order bits are filled with the sign bit of the initial value of each /// 32-bit element. The 32-bit results are packed into a 64-bit integer /// vector of [2 x i32]. @@ -941,9 +949,10 @@ _mm_srai_pi32(__m64 __m, int __count) /// \brief Right-shifts each 16-bit integer element of the first parameter, /// which is a 64-bit integer vector of [4 x i16], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. High-order -/// bits are cleared. The 16-bit results are packed into a 64-bit integer -/// vector of [4 x i16]. +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are cleared. The 16-bit results are packed into a 64-bit +/// integer vector of [4 x i16]. /// /// \headerfile <x86intrin.h> /// @@ -963,6 +972,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector /// of [4 x i16] by the number of bits specified by a 32-bit integer. +/// /// High-order bits are cleared. The 16-bit results are packed into a 64-bit /// integer vector of [4 x i16]. /// @@ -984,9 +994,10 @@ _mm_srli_pi16(__m64 __m, int __count) /// \brief Right-shifts each 32-bit integer element of the first parameter, /// which is a 64-bit integer vector of [2 x i32], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. High-order -/// bits are cleared. The 32-bit results are packed into a 64-bit integer -/// vector of [2 x i32]. +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are cleared. The 32-bit results are packed into a 64-bit +/// integer vector of [2 x i32]. /// /// \headerfile <x86intrin.h> /// @@ -1006,6 +1017,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector /// of [2 x i32] by the number of bits specified by a 32-bit integer. +/// /// High-order bits are cleared. The 32-bit results are packed into a 64-bit /// integer vector of [2 x i32]. /// @@ -1026,8 +1038,9 @@ _mm_srli_pi32(__m64 __m, int __count) } /// \brief Right-shifts the first 64-bit integer parameter by the number of bits -/// specified by the second 64-bit integer parameter. High-order bits are -/// cleared. +/// specified by the second 64-bit integer parameter. +/// +/// High-order bits are cleared. /// /// \headerfile <x86intrin.h> /// @@ -1046,7 +1059,9 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// \brief Right-shifts the first parameter, which is a 64-bit integer, by the /// number of bits specified by the second parameter, which is a 32-bit -/// integer. High-order bits are cleared. +/// integer. +/// +/// High-order bits are cleared. /// /// \headerfile <x86intrin.h> /// @@ -1140,8 +1155,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of /// [8 x i8] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. The comparison yields 0 for -/// false, 0xFF for true. +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1161,8 +1177,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of /// [4 x i16] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. The comparison yields 0 for -/// false, 0xFFFF for true. +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1182,8 +1199,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of /// [2 x i32] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. The comparison yields 0 for -/// false, 0xFFFFFFFF for true. +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1203,8 +1221,9 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of /// [8 x i8] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. The comparison yields 0 -/// for false, 0xFF for true. +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1224,8 +1243,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of /// [4 x i16] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. The comparison yields 0 -/// for false, 0xFFFF for true. +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1245,8 +1265,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of /// [2 x i32] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. The comparison yields 0 -/// for false, 0xFFFFFFFF for true. +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -1268,7 +1289,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction. +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. static __inline__ __m64 __DEFAULT_FN_ATTRS diff --git a/contrib/llvm/tools/clang/lib/Headers/module.modulemap b/contrib/llvm/tools/clang/lib/Headers/module.modulemap index 11ef2f9..95d26ce 100644 --- a/contrib/llvm/tools/clang/lib/Headers/module.modulemap +++ b/contrib/llvm/tools/clang/lib/Headers/module.modulemap @@ -61,6 +61,7 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "xopintrin.h" textual header "fma4intrin.h" textual header "mwaitxintrin.h" + textual header "clzerointrin.h" explicit module mm_malloc { requires !freestanding diff --git a/contrib/llvm/tools/clang/lib/Headers/opencl-c.h b/contrib/llvm/tools/clang/lib/Headers/opencl-c.h index 0c25d312..58c8daf 100644 --- a/contrib/llvm/tools/clang/lib/Headers/opencl-c.h +++ b/contrib/llvm/tools/clang/lib/Headers/opencl-c.h @@ -16,6 +16,12 @@ #endif //cl_khr_depth_images #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if __OPENCL_C_VERSION__ < CL_VERSION_2_0 +#ifdef cl_khr_3d_image_writes +#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable +#endif //cl_khr_3d_image_writes +#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0 + #define __ovld __attribute__((overloadable)) #define __conv __attribute__((convergent)) @@ -6578,777 +6584,85 @@ half16 __ovld __cnfn convert_half16_rtz(double16); * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators * Reinterprets a data type as another data type of the same size */ -char __ovld __cnfn as_char(char); -char __ovld __cnfn as_char(uchar); - -char2 __ovld __cnfn as_char2(char2); -char2 __ovld __cnfn as_char2(uchar2); -char2 __ovld __cnfn as_char2(short); -char2 __ovld __cnfn as_char2(ushort); - -char3 __ovld __cnfn as_char3(char3); -char3 __ovld __cnfn as_char3(char4); -char3 __ovld __cnfn as_char3(uchar3); -char3 __ovld __cnfn as_char3(uchar4); -char3 __ovld __cnfn as_char3(short2); -char3 __ovld __cnfn as_char3(ushort2); -char3 __ovld __cnfn as_char3(int); -char3 __ovld __cnfn as_char3(uint); -char3 __ovld __cnfn as_char3(float); - -char4 __ovld __cnfn as_char4(char3); -char4 __ovld __cnfn as_char4(char4); -char4 __ovld __cnfn as_char4(uchar3); -char4 __ovld __cnfn as_char4(uchar4); -char4 __ovld __cnfn as_char4(short2); -char4 __ovld __cnfn as_char4(ushort2); -char4 __ovld __cnfn as_char4(int); -char4 __ovld __cnfn as_char4(uint); -char4 __ovld __cnfn as_char4(float); - -char8 __ovld __cnfn as_char8(char8); -char8 __ovld __cnfn as_char8(uchar8); -char8 __ovld __cnfn as_char8(short3); -char8 __ovld __cnfn as_char8(short4); -char8 __ovld __cnfn as_char8(ushort3); -char8 __ovld __cnfn as_char8(ushort4); -char8 __ovld __cnfn as_char8(int2); -char8 __ovld __cnfn as_char8(uint2); -char8 __ovld __cnfn as_char8(long); -char8 __ovld __cnfn as_char8(ulong); -char8 __ovld __cnfn as_char8(float2); - -char16 __ovld __cnfn as_char16(char16); -char16 __ovld __cnfn as_char16(uchar16); -char16 __ovld __cnfn as_char16(short8); -char16 __ovld __cnfn as_char16(ushort8); -char16 __ovld __cnfn as_char16(int3); -char16 __ovld __cnfn as_char16(int4); -char16 __ovld __cnfn as_char16(uint3); -char16 __ovld __cnfn as_char16(uint4); -char16 __ovld __cnfn as_char16(long2); -char16 __ovld __cnfn as_char16(ulong2); -char16 __ovld __cnfn as_char16(float3); -char16 __ovld __cnfn as_char16(float4); - -uchar __ovld __cnfn as_uchar(char); -uchar __ovld __cnfn as_uchar(uchar); - -uchar2 __ovld __cnfn as_uchar2(char2); -uchar2 __ovld __cnfn as_uchar2(uchar2); -uchar2 __ovld __cnfn as_uchar2(short); -uchar2 __ovld __cnfn as_uchar2(ushort); - -uchar3 __ovld __cnfn as_uchar3(char3); -uchar3 __ovld __cnfn as_uchar3(char4); -uchar3 __ovld __cnfn as_uchar3(uchar3); -uchar3 __ovld __cnfn as_uchar3(uchar4); -uchar3 __ovld __cnfn as_uchar3(short2); -uchar3 __ovld __cnfn as_uchar3(ushort2); -uchar3 __ovld __cnfn as_uchar3(int); -uchar3 __ovld __cnfn as_uchar3(uint); -uchar3 __ovld __cnfn as_uchar3(float); - -uchar4 __ovld __cnfn as_uchar4(char3); -uchar4 __ovld __cnfn as_uchar4(char4); -uchar4 __ovld __cnfn as_uchar4(uchar3); -uchar4 __ovld __cnfn as_uchar4(uchar4); -uchar4 __ovld __cnfn as_uchar4(short2); -uchar4 __ovld __cnfn as_uchar4(ushort2); -uchar4 __ovld __cnfn as_uchar4(int); -uchar4 __ovld __cnfn as_uchar4(uint); -uchar4 __ovld __cnfn as_uchar4(float); - -uchar8 __ovld __cnfn as_uchar8(char8); -uchar8 __ovld __cnfn as_uchar8(uchar8); -uchar8 __ovld __cnfn as_uchar8(short3); -uchar8 __ovld __cnfn as_uchar8(short4); -uchar8 __ovld __cnfn as_uchar8(ushort3); -uchar8 __ovld __cnfn as_uchar8(ushort4); -uchar8 __ovld __cnfn as_uchar8(int2); -uchar8 __ovld __cnfn as_uchar8(uint2); -uchar8 __ovld __cnfn as_uchar8(long); -uchar8 __ovld __cnfn as_uchar8(ulong); -uchar8 __ovld __cnfn as_uchar8(float2); - -uchar16 __ovld __cnfn as_uchar16(char16); -uchar16 __ovld __cnfn as_uchar16(uchar16); -uchar16 __ovld __cnfn as_uchar16(short8); -uchar16 __ovld __cnfn as_uchar16(ushort8); -uchar16 __ovld __cnfn as_uchar16(int3); -uchar16 __ovld __cnfn as_uchar16(int4); -uchar16 __ovld __cnfn as_uchar16(uint3); -uchar16 __ovld __cnfn as_uchar16(uint4); -uchar16 __ovld __cnfn as_uchar16(long2); -uchar16 __ovld __cnfn as_uchar16(ulong2); -uchar16 __ovld __cnfn as_uchar16(float3); -uchar16 __ovld __cnfn as_uchar16(float4); - -short __ovld __cnfn as_short(char2); -short __ovld __cnfn as_short(uchar2); -short __ovld __cnfn as_short(short); -short __ovld __cnfn as_short(ushort); - -short2 __ovld __cnfn as_short2(char3); -short2 __ovld __cnfn as_short2(char4); -short2 __ovld __cnfn as_short2(uchar3); -short2 __ovld __cnfn as_short2(uchar4); -short2 __ovld __cnfn as_short2(short2); -short2 __ovld __cnfn as_short2(ushort2); -short2 __ovld __cnfn as_short2(int); -short2 __ovld __cnfn as_short2(uint); -short2 __ovld __cnfn as_short2(float); - -short3 __ovld __cnfn as_short3(char8); -short3 __ovld __cnfn as_short3(uchar8); -short3 __ovld __cnfn as_short3(short3); -short3 __ovld __cnfn as_short3(short4); -short3 __ovld __cnfn as_short3(ushort3); -short3 __ovld __cnfn as_short3(ushort4); -short3 __ovld __cnfn as_short3(int2); -short3 __ovld __cnfn as_short3(uint2); -short3 __ovld __cnfn as_short3(long); -short3 __ovld __cnfn as_short3(ulong); -short3 __ovld __cnfn as_short3(float2); - -short4 __ovld __cnfn as_short4(char8); -short4 __ovld __cnfn as_short4(uchar8); -short4 __ovld __cnfn as_short4(short3); -short4 __ovld __cnfn as_short4(short4); -short4 __ovld __cnfn as_short4(ushort3); -short4 __ovld __cnfn as_short4(ushort4); -short4 __ovld __cnfn as_short4(int2); -short4 __ovld __cnfn as_short4(uint2); -short4 __ovld __cnfn as_short4(long); -short4 __ovld __cnfn as_short4(ulong); -short4 __ovld __cnfn as_short4(float2); - -short8 __ovld __cnfn as_short8(char16); -short8 __ovld __cnfn as_short8(uchar16); -short8 __ovld __cnfn as_short8(short8); -short8 __ovld __cnfn as_short8(ushort8); -short8 __ovld __cnfn as_short8(int3); -short8 __ovld __cnfn as_short8(int4); -short8 __ovld __cnfn as_short8(uint3); -short8 __ovld __cnfn as_short8(uint4); -short8 __ovld __cnfn as_short8(long2); -short8 __ovld __cnfn as_short8(ulong2); -short8 __ovld __cnfn as_short8(float3); -short8 __ovld __cnfn as_short8(float4); - -short16 __ovld __cnfn as_short16(short16); -short16 __ovld __cnfn as_short16(ushort16); -short16 __ovld __cnfn as_short16(int8); -short16 __ovld __cnfn as_short16(uint8); -short16 __ovld __cnfn as_short16(long3); -short16 __ovld __cnfn as_short16(long4); -short16 __ovld __cnfn as_short16(ulong3); -short16 __ovld __cnfn as_short16(ulong4); -short16 __ovld __cnfn as_short16(float8); - -ushort __ovld __cnfn as_ushort(char2); -ushort __ovld __cnfn as_ushort(uchar2); -ushort __ovld __cnfn as_ushort(short); -ushort __ovld __cnfn as_ushort(ushort); - -ushort2 __ovld __cnfn as_ushort2(char3); -ushort2 __ovld __cnfn as_ushort2(char4); -ushort2 __ovld __cnfn as_ushort2(uchar3); -ushort2 __ovld __cnfn as_ushort2(uchar4); -ushort2 __ovld __cnfn as_ushort2(short2); -ushort2 __ovld __cnfn as_ushort2(ushort2); -ushort2 __ovld __cnfn as_ushort2(int); -ushort2 __ovld __cnfn as_ushort2(uint); -ushort2 __ovld __cnfn as_ushort2(float); - -ushort3 __ovld __cnfn as_ushort3(char8); -ushort3 __ovld __cnfn as_ushort3(uchar8); -ushort3 __ovld __cnfn as_ushort3(short3); -ushort3 __ovld __cnfn as_ushort3(short4); -ushort3 __ovld __cnfn as_ushort3(ushort3); -ushort3 __ovld __cnfn as_ushort3(ushort4); -ushort3 __ovld __cnfn as_ushort3(int2); -ushort3 __ovld __cnfn as_ushort3(uint2); -ushort3 __ovld __cnfn as_ushort3(long); -ushort3 __ovld __cnfn as_ushort3(ulong); -ushort3 __ovld __cnfn as_ushort3(float2); - -ushort4 __ovld __cnfn as_ushort4(char8); -ushort4 __ovld __cnfn as_ushort4(uchar8); -ushort4 __ovld __cnfn as_ushort4(short3); -ushort4 __ovld __cnfn as_ushort4(short4); -ushort4 __ovld __cnfn as_ushort4(ushort3); -ushort4 __ovld __cnfn as_ushort4(ushort4); -ushort4 __ovld __cnfn as_ushort4(int2); -ushort4 __ovld __cnfn as_ushort4(uint2); -ushort4 __ovld __cnfn as_ushort4(long); -ushort4 __ovld __cnfn as_ushort4(ulong); -ushort4 __ovld __cnfn as_ushort4(float2); - -ushort8 __ovld __cnfn as_ushort8(char16); -ushort8 __ovld __cnfn as_ushort8(uchar16); -ushort8 __ovld __cnfn as_ushort8(short8); -ushort8 __ovld __cnfn as_ushort8(ushort8); -ushort8 __ovld __cnfn as_ushort8(int3); -ushort8 __ovld __cnfn as_ushort8(int4); -ushort8 __ovld __cnfn as_ushort8(uint3); -ushort8 __ovld __cnfn as_ushort8(uint4); -ushort8 __ovld __cnfn as_ushort8(long2); -ushort8 __ovld __cnfn as_ushort8(ulong2); -ushort8 __ovld __cnfn as_ushort8(float3); -ushort8 __ovld __cnfn as_ushort8(float4); - -ushort16 __ovld __cnfn as_ushort16(short16); -ushort16 __ovld __cnfn as_ushort16(ushort16); -ushort16 __ovld __cnfn as_ushort16(int8); -ushort16 __ovld __cnfn as_ushort16(uint8); -ushort16 __ovld __cnfn as_ushort16(long3); -ushort16 __ovld __cnfn as_ushort16(long4); -ushort16 __ovld __cnfn as_ushort16(ulong3); -ushort16 __ovld __cnfn as_ushort16(ulong4); -ushort16 __ovld __cnfn as_ushort16(float8); - -int __ovld __cnfn as_int(char3); -int __ovld __cnfn as_int(char4); -int __ovld __cnfn as_int(uchar3); -int __ovld __cnfn as_int(uchar4); -int __ovld __cnfn as_int(short2); -int __ovld __cnfn as_int(ushort2); -int __ovld __cnfn as_int(int); -int __ovld __cnfn as_int(uint); -int __ovld __cnfn as_int(float); - -int2 __ovld __cnfn as_int2(char8); -int2 __ovld __cnfn as_int2(uchar8); -int2 __ovld __cnfn as_int2(short3); -int2 __ovld __cnfn as_int2(short4); -int2 __ovld __cnfn as_int2(ushort3); -int2 __ovld __cnfn as_int2(ushort4); -int2 __ovld __cnfn as_int2(int2); -int2 __ovld __cnfn as_int2(uint2); -int2 __ovld __cnfn as_int2(long); -int2 __ovld __cnfn as_int2(ulong); -int2 __ovld __cnfn as_int2(float2); - -int3 __ovld __cnfn as_int3(char16); -int3 __ovld __cnfn as_int3(uchar16); -int3 __ovld __cnfn as_int3(short8); -int3 __ovld __cnfn as_int3(ushort8); -int3 __ovld __cnfn as_int3(int3); -int3 __ovld __cnfn as_int3(int4); -int3 __ovld __cnfn as_int3(uint3); -int3 __ovld __cnfn as_int3(uint4); -int3 __ovld __cnfn as_int3(long2); -int3 __ovld __cnfn as_int3(ulong2); -int3 __ovld __cnfn as_int3(float3); -int3 __ovld __cnfn as_int3(float4); - -int4 __ovld __cnfn as_int4(char16); -int4 __ovld __cnfn as_int4(uchar16); -int4 __ovld __cnfn as_int4(short8); -int4 __ovld __cnfn as_int4(ushort8); -int4 __ovld __cnfn as_int4(int3); -int4 __ovld __cnfn as_int4(int4); -int4 __ovld __cnfn as_int4(uint3); -int4 __ovld __cnfn as_int4(uint4); -int4 __ovld __cnfn as_int4(long2); -int4 __ovld __cnfn as_int4(ulong2); -int4 __ovld __cnfn as_int4(float3); -int4 __ovld __cnfn as_int4(float4); - -int8 __ovld __cnfn as_int8(short16); -int8 __ovld __cnfn as_int8(ushort16); -int8 __ovld __cnfn as_int8(int8); -int8 __ovld __cnfn as_int8(uint8); -int8 __ovld __cnfn as_int8(long3); -int8 __ovld __cnfn as_int8(long4); -int8 __ovld __cnfn as_int8(ulong3); -int8 __ovld __cnfn as_int8(ulong4); -int8 __ovld __cnfn as_int8(float8); - -int16 __ovld __cnfn as_int16(int16); -int16 __ovld __cnfn as_int16(uint16); -int16 __ovld __cnfn as_int16(long8); -int16 __ovld __cnfn as_int16(ulong8); -int16 __ovld __cnfn as_int16(float16); - -uint __ovld __cnfn as_uint(char3); -uint __ovld __cnfn as_uint(char4); -uint __ovld __cnfn as_uint(uchar3); -uint __ovld __cnfn as_uint(uchar4); -uint __ovld __cnfn as_uint(short2); -uint __ovld __cnfn as_uint(ushort2); -uint __ovld __cnfn as_uint(int); -uint __ovld __cnfn as_uint(uint); -uint __ovld __cnfn as_uint(float); - -uint2 __ovld __cnfn as_uint2(char8); -uint2 __ovld __cnfn as_uint2(uchar8); -uint2 __ovld __cnfn as_uint2(short3); -uint2 __ovld __cnfn as_uint2(short4); -uint2 __ovld __cnfn as_uint2(ushort3); -uint2 __ovld __cnfn as_uint2(ushort4); -uint2 __ovld __cnfn as_uint2(int2); -uint2 __ovld __cnfn as_uint2(uint2); -uint2 __ovld __cnfn as_uint2(long); -uint2 __ovld __cnfn as_uint2(ulong); -uint2 __ovld __cnfn as_uint2(float2); - -uint3 __ovld __cnfn as_uint3(char16); -uint3 __ovld __cnfn as_uint3(uchar16); -uint3 __ovld __cnfn as_uint3(short8); -uint3 __ovld __cnfn as_uint3(ushort8); -uint3 __ovld __cnfn as_uint3(int3); -uint3 __ovld __cnfn as_uint3(int4); -uint3 __ovld __cnfn as_uint3(uint3); -uint3 __ovld __cnfn as_uint3(uint4); -uint3 __ovld __cnfn as_uint3(long2); -uint3 __ovld __cnfn as_uint3(ulong2); -uint3 __ovld __cnfn as_uint3(float3); -uint3 __ovld __cnfn as_uint3(float4); - -uint4 __ovld __cnfn as_uint4(char16); -uint4 __ovld __cnfn as_uint4(uchar16); -uint4 __ovld __cnfn as_uint4(short8); -uint4 __ovld __cnfn as_uint4(ushort8); -uint4 __ovld __cnfn as_uint4(int3); -uint4 __ovld __cnfn as_uint4(int4); -uint4 __ovld __cnfn as_uint4(uint3); -uint4 __ovld __cnfn as_uint4(uint4); -uint4 __ovld __cnfn as_uint4(long2); -uint4 __ovld __cnfn as_uint4(ulong2); -uint4 __ovld __cnfn as_uint4(float3); -uint4 __ovld __cnfn as_uint4(float4); - -uint8 __ovld __cnfn as_uint8(short16); -uint8 __ovld __cnfn as_uint8(ushort16); -uint8 __ovld __cnfn as_uint8(int8); -uint8 __ovld __cnfn as_uint8(uint8); -uint8 __ovld __cnfn as_uint8(long3); -uint8 __ovld __cnfn as_uint8(long4); -uint8 __ovld __cnfn as_uint8(ulong3); -uint8 __ovld __cnfn as_uint8(ulong4); -uint8 __ovld __cnfn as_uint8(float8); - -uint16 __ovld __cnfn as_uint16(int16); -uint16 __ovld __cnfn as_uint16(uint16); -uint16 __ovld __cnfn as_uint16(long8); -uint16 __ovld __cnfn as_uint16(ulong8); -uint16 __ovld __cnfn as_uint16(float16); - -long __ovld __cnfn as_long(char8); -long __ovld __cnfn as_long(uchar8); -long __ovld __cnfn as_long(short3); -long __ovld __cnfn as_long(short4); -long __ovld __cnfn as_long(ushort3); -long __ovld __cnfn as_long(ushort4); -long __ovld __cnfn as_long(int2); -long __ovld __cnfn as_long(uint2); -long __ovld __cnfn as_long(long); -long __ovld __cnfn as_long(ulong); -long __ovld __cnfn as_long(float2); - -long2 __ovld __cnfn as_long2(char16); -long2 __ovld __cnfn as_long2(uchar16); -long2 __ovld __cnfn as_long2(short8); -long2 __ovld __cnfn as_long2(ushort8); -long2 __ovld __cnfn as_long2(int3); -long2 __ovld __cnfn as_long2(int4); -long2 __ovld __cnfn as_long2(uint3); -long2 __ovld __cnfn as_long2(uint4); -long2 __ovld __cnfn as_long2(long2); -long2 __ovld __cnfn as_long2(ulong2); -long2 __ovld __cnfn as_long2(float3); -long2 __ovld __cnfn as_long2(float4); - -long3 __ovld __cnfn as_long3(short16); -long3 __ovld __cnfn as_long3(ushort16); -long3 __ovld __cnfn as_long3(int8); -long3 __ovld __cnfn as_long3(uint8); -long3 __ovld __cnfn as_long3(long3); -long3 __ovld __cnfn as_long3(long4); -long3 __ovld __cnfn as_long3(ulong3); -long3 __ovld __cnfn as_long3(ulong4); -long3 __ovld __cnfn as_long3(float8); - -long4 __ovld __cnfn as_long4(short16); -long4 __ovld __cnfn as_long4(ushort16); -long4 __ovld __cnfn as_long4(int8); -long4 __ovld __cnfn as_long4(uint8); -long4 __ovld __cnfn as_long4(long3); -long4 __ovld __cnfn as_long4(long4); -long4 __ovld __cnfn as_long4(ulong3); -long4 __ovld __cnfn as_long4(ulong4); -long4 __ovld __cnfn as_long4(float8); - -long8 __ovld __cnfn as_long8(int16); -long8 __ovld __cnfn as_long8(uint16); -long8 __ovld __cnfn as_long8(long8); -long8 __ovld __cnfn as_long8(ulong8); -long8 __ovld __cnfn as_long8(float16); - -long16 __ovld __cnfn as_long16(long16); -long16 __ovld __cnfn as_long16(ulong16); - -ulong __ovld __cnfn as_ulong(char8); -ulong __ovld __cnfn as_ulong(uchar8); -ulong __ovld __cnfn as_ulong(short3); -ulong __ovld __cnfn as_ulong(short4); -ulong __ovld __cnfn as_ulong(ushort3); -ulong __ovld __cnfn as_ulong(ushort4); -ulong __ovld __cnfn as_ulong(int2); -ulong __ovld __cnfn as_ulong(uint2); -ulong __ovld __cnfn as_ulong(long); -ulong __ovld __cnfn as_ulong(ulong); -ulong __ovld __cnfn as_ulong(float2); - -ulong2 __ovld __cnfn as_ulong2(char16); -ulong2 __ovld __cnfn as_ulong2(uchar16); -ulong2 __ovld __cnfn as_ulong2(short8); -ulong2 __ovld __cnfn as_ulong2(ushort8); -ulong2 __ovld __cnfn as_ulong2(int3); -ulong2 __ovld __cnfn as_ulong2(int4); -ulong2 __ovld __cnfn as_ulong2(uint3); -ulong2 __ovld __cnfn as_ulong2(uint4); -ulong2 __ovld __cnfn as_ulong2(long2); -ulong2 __ovld __cnfn as_ulong2(ulong2); -ulong2 __ovld __cnfn as_ulong2(float3); -ulong2 __ovld __cnfn as_ulong2(float4); - -ulong3 __ovld __cnfn as_ulong3(short16); -ulong3 __ovld __cnfn as_ulong3(ushort16); -ulong3 __ovld __cnfn as_ulong3(int8); -ulong3 __ovld __cnfn as_ulong3(uint8); -ulong3 __ovld __cnfn as_ulong3(long3); -ulong3 __ovld __cnfn as_ulong3(long4); -ulong3 __ovld __cnfn as_ulong3(ulong3); -ulong3 __ovld __cnfn as_ulong3(ulong4); -ulong3 __ovld __cnfn as_ulong3(float8); - -ulong4 __ovld __cnfn as_ulong4(short16); -ulong4 __ovld __cnfn as_ulong4(ushort16); -ulong4 __ovld __cnfn as_ulong4(int8); -ulong4 __ovld __cnfn as_ulong4(uint8); -ulong4 __ovld __cnfn as_ulong4(long3); -ulong4 __ovld __cnfn as_ulong4(long4); -ulong4 __ovld __cnfn as_ulong4(ulong3); -ulong4 __ovld __cnfn as_ulong4(ulong4); -ulong4 __ovld __cnfn as_ulong4(float8); - -ulong8 __ovld __cnfn as_ulong8(int16); -ulong8 __ovld __cnfn as_ulong8(uint16); -ulong8 __ovld __cnfn as_ulong8(long8); -ulong8 __ovld __cnfn as_ulong8(ulong8); -ulong8 __ovld __cnfn as_ulong8(float16); - -ulong16 __ovld __cnfn as_ulong16(long16); -ulong16 __ovld __cnfn as_ulong16(ulong16); - -float __ovld __cnfn as_float(char3); -float __ovld __cnfn as_float(char4); -float __ovld __cnfn as_float(uchar3); -float __ovld __cnfn as_float(uchar4); -float __ovld __cnfn as_float(short2); -float __ovld __cnfn as_float(ushort2); -float __ovld __cnfn as_float(int); -float __ovld __cnfn as_float(uint); -float __ovld __cnfn as_float(float); - -float2 __ovld __cnfn as_float2(char8); -float2 __ovld __cnfn as_float2(uchar8); -float2 __ovld __cnfn as_float2(short3); -float2 __ovld __cnfn as_float2(short4); -float2 __ovld __cnfn as_float2(ushort3); -float2 __ovld __cnfn as_float2(ushort4); -float2 __ovld __cnfn as_float2(int2); -float2 __ovld __cnfn as_float2(uint2); -float2 __ovld __cnfn as_float2(long); -float2 __ovld __cnfn as_float2(ulong); -float2 __ovld __cnfn as_float2(float2); - -float3 __ovld __cnfn as_float3(char16); -float3 __ovld __cnfn as_float3(uchar16); -float3 __ovld __cnfn as_float3(short8); -float3 __ovld __cnfn as_float3(ushort8); -float3 __ovld __cnfn as_float3(int3); -float3 __ovld __cnfn as_float3(int4); -float3 __ovld __cnfn as_float3(uint3); -float3 __ovld __cnfn as_float3(uint4); -float3 __ovld __cnfn as_float3(long2); -float3 __ovld __cnfn as_float3(ulong2); -float3 __ovld __cnfn as_float3(float3); -float3 __ovld __cnfn as_float3(float4); - -float4 __ovld __cnfn as_float4(char16); -float4 __ovld __cnfn as_float4(uchar16); -float4 __ovld __cnfn as_float4(short8); -float4 __ovld __cnfn as_float4(ushort8); -float4 __ovld __cnfn as_float4(int3); -float4 __ovld __cnfn as_float4(int4); -float4 __ovld __cnfn as_float4(uint3); -float4 __ovld __cnfn as_float4(uint4); -float4 __ovld __cnfn as_float4(long2); -float4 __ovld __cnfn as_float4(ulong2); -float4 __ovld __cnfn as_float4(float3); -float4 __ovld __cnfn as_float4(float4); - -float8 __ovld __cnfn as_float8(short16); -float8 __ovld __cnfn as_float8(ushort16); -float8 __ovld __cnfn as_float8(int8); -float8 __ovld __cnfn as_float8(uint8); -float8 __ovld __cnfn as_float8(long3); -float8 __ovld __cnfn as_float8(long4); -float8 __ovld __cnfn as_float8(ulong3); -float8 __ovld __cnfn as_float8(ulong4); -float8 __ovld __cnfn as_float8(float8); - -float16 __ovld __cnfn as_float16(int16); -float16 __ovld __cnfn as_float16(uint16); -float16 __ovld __cnfn as_float16(long8); -float16 __ovld __cnfn as_float16(ulong8); -float16 __ovld __cnfn as_float16(float16); +#define as_char(x) __builtin_astype((x), char) +#define as_char2(x) __builtin_astype((x), char2) +#define as_char3(x) __builtin_astype((x), char3) +#define as_char4(x) __builtin_astype((x), char4) +#define as_char8(x) __builtin_astype((x), char8) +#define as_char16(x) __builtin_astype((x), char16) + +#define as_uchar(x) __builtin_astype((x), uchar) +#define as_uchar2(x) __builtin_astype((x), uchar2) +#define as_uchar3(x) __builtin_astype((x), uchar3) +#define as_uchar4(x) __builtin_astype((x), uchar4) +#define as_uchar8(x) __builtin_astype((x), uchar8) +#define as_uchar16(x) __builtin_astype((x), uchar16) + +#define as_short(x) __builtin_astype((x), short) +#define as_short2(x) __builtin_astype((x), short2) +#define as_short3(x) __builtin_astype((x), short3) +#define as_short4(x) __builtin_astype((x), short4) +#define as_short8(x) __builtin_astype((x), short8) +#define as_short16(x) __builtin_astype((x), short16) + +#define as_ushort(x) __builtin_astype((x), ushort) +#define as_ushort2(x) __builtin_astype((x), ushort2) +#define as_ushort3(x) __builtin_astype((x), ushort3) +#define as_ushort4(x) __builtin_astype((x), ushort4) +#define as_ushort8(x) __builtin_astype((x), ushort8) +#define as_ushort16(x) __builtin_astype((x), ushort16) + +#define as_int(x) __builtin_astype((x), int) +#define as_int2(x) __builtin_astype((x), int2) +#define as_int3(x) __builtin_astype((x), int3) +#define as_int4(x) __builtin_astype((x), int4) +#define as_int8(x) __builtin_astype((x), int8) +#define as_int16(x) __builtin_astype((x), int16) + +#define as_uint(x) __builtin_astype((x), uint) +#define as_uint2(x) __builtin_astype((x), uint2) +#define as_uint3(x) __builtin_astype((x), uint3) +#define as_uint4(x) __builtin_astype((x), uint4) +#define as_uint8(x) __builtin_astype((x), uint8) +#define as_uint16(x) __builtin_astype((x), uint16) + +#define as_long(x) __builtin_astype((x), long) +#define as_long2(x) __builtin_astype((x), long2) +#define as_long3(x) __builtin_astype((x), long3) +#define as_long4(x) __builtin_astype((x), long4) +#define as_long8(x) __builtin_astype((x), long8) +#define as_long16(x) __builtin_astype((x), long16) + +#define as_ulong(x) __builtin_astype((x), ulong) +#define as_ulong2(x) __builtin_astype((x), ulong2) +#define as_ulong3(x) __builtin_astype((x), ulong3) +#define as_ulong4(x) __builtin_astype((x), ulong4) +#define as_ulong8(x) __builtin_astype((x), ulong8) +#define as_ulong16(x) __builtin_astype((x), ulong16) + +#define as_float(x) __builtin_astype((x), float) +#define as_float2(x) __builtin_astype((x), float2) +#define as_float3(x) __builtin_astype((x), float3) +#define as_float4(x) __builtin_astype((x), float4) +#define as_float8(x) __builtin_astype((x), float8) +#define as_float16(x) __builtin_astype((x), float16) #ifdef cl_khr_fp64 -char8 __ovld __cnfn as_char8(double); -char16 __ovld __cnfn as_char16(double2); -uchar8 __ovld __cnfn as_uchar8(double); -uchar16 __ovld __cnfn as_uchar16(double2); -short3 __ovld __cnfn as_short3(double); -short4 __ovld __cnfn as_short4(double); -short8 __ovld __cnfn as_short8(double2); -short16 __ovld __cnfn as_short16(double3); -short16 __ovld __cnfn as_short16(double4); -ushort3 __ovld __cnfn as_ushort3(double); -ushort4 __ovld __cnfn as_ushort4(double); -ushort8 __ovld __cnfn as_ushort8(double2); -ushort16 __ovld __cnfn as_ushort16(double3); -ushort16 __ovld __cnfn as_ushort16(double4); -int2 __ovld __cnfn as_int2(double); -int3 __ovld __cnfn as_int3(double2); -int4 __ovld __cnfn as_int4(double2); -int8 __ovld __cnfn as_int8(double3); -int8 __ovld __cnfn as_int8(double4); -int16 __ovld __cnfn as_int16(double8); -uint2 __ovld __cnfn as_uint2(double); -uint3 __ovld __cnfn as_uint3(double2); -uint4 __ovld __cnfn as_uint4(double2); -uint8 __ovld __cnfn as_uint8(double3); -uint8 __ovld __cnfn as_uint8(double4); -uint16 __ovld __cnfn as_uint16(double8); -long __ovld __cnfn as_long(double); -long2 __ovld __cnfn as_long2(double2); -long3 __ovld __cnfn as_long3(double3); -long3 __ovld __cnfn as_long3(double4); -long4 __ovld __cnfn as_long4(double3); -long4 __ovld __cnfn as_long4(double4); -long8 __ovld __cnfn as_long8(double8); -long16 __ovld __cnfn as_long16(double16); -ulong __ovld __cnfn as_ulong(double); -ulong2 __ovld __cnfn as_ulong2(double2); -ulong3 __ovld __cnfn as_ulong3(double3); -ulong3 __ovld __cnfn as_ulong3(double4); -ulong4 __ovld __cnfn as_ulong4(double3); -ulong4 __ovld __cnfn as_ulong4(double4); -ulong8 __ovld __cnfn as_ulong8(double8); -ulong16 __ovld __cnfn as_ulong16(double16); -float2 __ovld __cnfn as_float2(double); -float3 __ovld __cnfn as_float3(double2); -float4 __ovld __cnfn as_float4(double2); -float8 __ovld __cnfn as_float8(double3); -float8 __ovld __cnfn as_float8(double4); -float16 __ovld __cnfn as_float16(double8); -double __ovld __cnfn as_double(char8); -double __ovld __cnfn as_double(uchar8); -double __ovld __cnfn as_double(short3); -double __ovld __cnfn as_double(short4); -double __ovld __cnfn as_double(ushort3); -double __ovld __cnfn as_double(ushort4); -double __ovld __cnfn as_double(int2); -double __ovld __cnfn as_double(uint2); -double __ovld __cnfn as_double(long); -double __ovld __cnfn as_double(ulong); -double __ovld __cnfn as_double(float2); -double __ovld __cnfn as_double(double); -double2 __ovld __cnfn as_double2(char16); -double2 __ovld __cnfn as_double2(uchar16); -double2 __ovld __cnfn as_double2(short8); -double2 __ovld __cnfn as_double2(ushort8); -double2 __ovld __cnfn as_double2(int3); -double2 __ovld __cnfn as_double2(int4); -double2 __ovld __cnfn as_double2(uint3); -double2 __ovld __cnfn as_double2(uint4); -double2 __ovld __cnfn as_double2(long2); -double2 __ovld __cnfn as_double2(ulong2); -double2 __ovld __cnfn as_double2(float3); -double2 __ovld __cnfn as_double2(float4); -double2 __ovld __cnfn as_double2(double2); -double3 __ovld __cnfn as_double3(short16); -double3 __ovld __cnfn as_double3(ushort16); -double3 __ovld __cnfn as_double3(int8); -double3 __ovld __cnfn as_double3(uint8); -double3 __ovld __cnfn as_double3(long3); -double3 __ovld __cnfn as_double3(long4); -double3 __ovld __cnfn as_double3(ulong3); -double3 __ovld __cnfn as_double3(ulong4); -double3 __ovld __cnfn as_double3(float8); -double3 __ovld __cnfn as_double3(double3); -double3 __ovld __cnfn as_double3(double4); -double4 __ovld __cnfn as_double4(short16); -double4 __ovld __cnfn as_double4(ushort16); -double4 __ovld __cnfn as_double4(int8); -double4 __ovld __cnfn as_double4(uint8); -double4 __ovld __cnfn as_double4(long3); -double4 __ovld __cnfn as_double4(long4); -double4 __ovld __cnfn as_double4(ulong3); -double4 __ovld __cnfn as_double4(ulong4); -double4 __ovld __cnfn as_double4(float8); -double4 __ovld __cnfn as_double4(double3); -double4 __ovld __cnfn as_double4(double4); -double8 __ovld __cnfn as_double8(int16); -double8 __ovld __cnfn as_double8(uint16); -double8 __ovld __cnfn as_double8(long8); -double8 __ovld __cnfn as_double8(ulong8); -double8 __ovld __cnfn as_double8(float16); -double8 __ovld __cnfn as_double8(double8); -double16 __ovld __cnfn as_double16(long16); -double16 __ovld __cnfn as_double16(ulong16); -double16 __ovld __cnfn as_double16(double16); +#define as_double(x) __builtin_astype((x), double) +#define as_double2(x) __builtin_astype((x), double2) +#define as_double3(x) __builtin_astype((x), double3) +#define as_double4(x) __builtin_astype((x), double4) +#define as_double8(x) __builtin_astype((x), double8) +#define as_double16(x) __builtin_astype((x), double16) #endif //cl_khr_fp64 #ifdef cl_khr_fp16 -char2 __ovld __cnfn as_char2(half); -char3 __ovld __cnfn as_char3(half2); -char4 __ovld __cnfn as_char4(half2); -char8 __ovld __cnfn as_char8(half3); -char8 __ovld __cnfn as_char8(half4); -char16 __ovld __cnfn as_char16(half8); -uchar2 __ovld __cnfn as_uchar2(half); -uchar3 __ovld __cnfn as_uchar3(half2); -uchar4 __ovld __cnfn as_uchar4(half2); -uchar8 __ovld __cnfn as_uchar8(half3); -uchar8 __ovld __cnfn as_uchar8(half4); -uchar16 __ovld __cnfn as_uchar16(half8); -short __ovld __cnfn as_short(half); -short2 __ovld __cnfn as_short2(half2); -short3 __ovld __cnfn as_short3(half3); -short3 __ovld __cnfn as_short3(half4); -short4 __ovld __cnfn as_short4(half3); -short4 __ovld __cnfn as_short4(half4); -short8 __ovld __cnfn as_short8(half8); -short16 __ovld __cnfn as_short16(half16); -ushort __ovld __cnfn as_ushort(half); -ushort2 __ovld __cnfn as_ushort2(half2); -ushort3 __ovld __cnfn as_ushort3(half3); -ushort3 __ovld __cnfn as_ushort3(half4); -ushort4 __ovld __cnfn as_ushort4(half3); -ushort4 __ovld __cnfn as_ushort4(half4); -ushort8 __ovld __cnfn as_ushort8(half8); -ushort16 __ovld __cnfn as_ushort16(half16); -int __ovld __cnfn as_int(half2); -int2 __ovld __cnfn as_int2(half3); -int2 __ovld __cnfn as_int2(half4); -int3 __ovld __cnfn as_int3(half8); -int4 __ovld __cnfn as_int4(half8); -int8 __ovld __cnfn as_int8(half16); -uint __ovld __cnfn as_uint(half2); -uint2 __ovld __cnfn as_uint2(half3); -uint2 __ovld __cnfn as_uint2(half4); -uint3 __ovld __cnfn as_uint3(half8); -uint4 __ovld __cnfn as_uint4(half8); -uint8 __ovld __cnfn as_uint8(half16); -long __ovld __cnfn as_long(half3); -long __ovld __cnfn as_long(half4); -long2 __ovld __cnfn as_long2(half8); -long3 __ovld __cnfn as_long3(half16); -long4 __ovld __cnfn as_long4(half16); -ulong __ovld __cnfn as_ulong(half3); -ulong __ovld __cnfn as_ulong(half4); -ulong2 __ovld __cnfn as_ulong2(half8); -ulong3 __ovld __cnfn as_ulong3(half16); -ulong4 __ovld __cnfn as_ulong4(half16); -half __ovld __cnfn as_half(char2); -half __ovld __cnfn as_half(uchar2); -half __ovld __cnfn as_half(short); -half __ovld __cnfn as_half(ushort); -half __ovld __cnfn as_half(half); -half2 __ovld __cnfn as_half2(char3); -half2 __ovld __cnfn as_half2(char4); -half2 __ovld __cnfn as_half2(uchar3); -half2 __ovld __cnfn as_half2(uchar4); -half2 __ovld __cnfn as_half2(short2); -half2 __ovld __cnfn as_half2(ushort2); -half2 __ovld __cnfn as_half2(int); -half2 __ovld __cnfn as_half2(uint); -half2 __ovld __cnfn as_half2(half2); -half2 __ovld __cnfn as_half2(float); -half3 __ovld __cnfn as_half3(char8); -half3 __ovld __cnfn as_half3(uchar8); -half3 __ovld __cnfn as_half3(short3); -half3 __ovld __cnfn as_half3(short4); -half3 __ovld __cnfn as_half3(ushort3); -half3 __ovld __cnfn as_half3(ushort4); -half3 __ovld __cnfn as_half3(int2); -half3 __ovld __cnfn as_half3(uint2); -half3 __ovld __cnfn as_half3(long); -half3 __ovld __cnfn as_half3(ulong); -half3 __ovld __cnfn as_half3(half3); -half3 __ovld __cnfn as_half3(half4); -half3 __ovld __cnfn as_half3(float2); -half4 __ovld __cnfn as_half4(char8); -half4 __ovld __cnfn as_half4(uchar8); -half4 __ovld __cnfn as_half4(short3); -half4 __ovld __cnfn as_half4(short4); -half4 __ovld __cnfn as_half4(ushort3); -half4 __ovld __cnfn as_half4(ushort4); -half4 __ovld __cnfn as_half4(int2); -half4 __ovld __cnfn as_half4(uint2); -half4 __ovld __cnfn as_half4(long); -half4 __ovld __cnfn as_half4(ulong); -half4 __ovld __cnfn as_half4(half3); -half4 __ovld __cnfn as_half4(half4); -half4 __ovld __cnfn as_half4(float2); -half8 __ovld __cnfn as_half8(char16); -half8 __ovld __cnfn as_half8(uchar16); -half8 __ovld __cnfn as_half8(short8); -half8 __ovld __cnfn as_half8(ushort8); -half8 __ovld __cnfn as_half8(int3); -half8 __ovld __cnfn as_half8(int4); -half8 __ovld __cnfn as_half8(uint3); -half8 __ovld __cnfn as_half8(uint4); -half8 __ovld __cnfn as_half8(long2); -half8 __ovld __cnfn as_half8(ulong2); -half8 __ovld __cnfn as_half8(half8); -half8 __ovld __cnfn as_half8(float3); -half8 __ovld __cnfn as_half8(float4); -half16 __ovld __cnfn as_half16(short16); -half16 __ovld __cnfn as_half16(ushort16); -half16 __ovld __cnfn as_half16(int8); -half16 __ovld __cnfn as_half16(uint8); -half16 __ovld __cnfn as_half16(long3); -half16 __ovld __cnfn as_half16(long4); -half16 __ovld __cnfn as_half16(ulong3); -half16 __ovld __cnfn as_half16(ulong4); -half16 __ovld __cnfn as_half16(half16); -half16 __ovld __cnfn as_half16(float8); -float __ovld __cnfn as_float(half2); -float2 __ovld __cnfn as_float2(half3); -float2 __ovld __cnfn as_float2(half4); -float3 __ovld __cnfn as_float3(half8); -float4 __ovld __cnfn as_float4(half8); -float8 __ovld __cnfn as_float8(half16); - -#ifdef cl_khr_fp64 -half3 __ovld __cnfn as_half3(double); -half4 __ovld __cnfn as_half4(double); -half8 __ovld __cnfn as_half8(double2); -half16 __ovld __cnfn as_half16(double3); -half16 __ovld __cnfn as_half16(double4); -double __ovld __cnfn as_double(half3); -double __ovld __cnfn as_double(half4); -double2 __ovld __cnfn as_double2(half8); -double3 __ovld __cnfn as_double3(half16); -double4 __ovld __cnfn as_double4(half16); -#endif //cl_khr_fp64 +#define as_half(x) __builtin_astype((x), half) +#define as_half2(x) __builtin_astype((x), half2) +#define as_half3(x) __builtin_astype((x), half3) +#define as_half4(x) __builtin_astype((x), half4) +#define as_half8(x) __builtin_astype((x), half8) +#define as_half16(x) __builtin_astype((x), half16) #endif //cl_khr_fp16 // OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers @@ -14389,10 +13703,10 @@ float __ovld atomic_xchg(volatile __local float *p, float val); #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_xchg(volatile __global int *p, int val); -int __ovld atom_xchg(volatile __local int *p, int val); +unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val); #endif #if defined(cl_khr_local_int32_base_atomics) -unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val); +int __ovld atom_xchg(volatile __local int *p, int val); unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val); #endif @@ -14509,8 +13823,6 @@ unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val) #if defined(cl_khr_int64_extended_atomics) long __ovld atom_min(volatile __global long *p, long val); unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val); -#endif -#if defined(cl_khr_local_int32_extended_atomics) long __ovld atom_min(volatile __local long *p, long val); unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val); #endif @@ -15650,6 +14962,7 @@ float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, in #endif //cl_khr_gl_msaa_sharing // OpenCL Extension v2.0 s9.18 - Mipmaps +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifdef cl_khr_mipmap_image float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod); @@ -15725,6 +15038,7 @@ int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); #endif //cl_khr_mipmap_image +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 /** * Sampler-less Image Access @@ -15823,6 +15137,7 @@ float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 co float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample); #endif //cl_khr_gl_msaa_sharing +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifdef cl_khr_mipmap_image float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod); int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod); @@ -15896,6 +15211,7 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); #endif //cl_khr_mipmap_image +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // Image read functions returning half4 type #ifdef cl_khr_fp16 @@ -15995,9 +15311,11 @@ void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, flo void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color); void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color); +#ifdef cl_khr_3d_image_writes void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color); void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color); void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color); +#endif #ifdef cl_khr_depth_images void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color); @@ -16005,6 +15323,7 @@ void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, flo #endif //cl_khr_depth_images // OpenCL Extension v2.0 s9.18 - Mipmaps +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifdef cl_khr_mipmap_image void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color); void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color); @@ -16025,16 +15344,21 @@ void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, in void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color); void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color); +#ifdef cl_khr_3d_image_writes void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color); void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color); void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color); +#endif #endif //cl_khr_mipmap_image +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // Image write functions for half4 type #ifdef cl_khr_fp16 void __ovld write_imageh(write_only image1d_t image, int coord, half4 color); void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color); +#ifdef cl_khr_3d_image_writes void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color); +#endif void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color); void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color); void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color); @@ -16062,15 +15386,18 @@ void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, flo void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color); void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color); +#ifdef cl_khr_3d_image_writes void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color); void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color); void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color); +#endif #ifdef cl_khr_depth_images void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color); void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color); #endif //cl_khr_depth_images +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifdef cl_khr_mipmap_image void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color); void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color); @@ -16091,16 +15418,21 @@ void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, in void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color); void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color); +#ifdef cl_khr_3d_image_writes void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color); void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color); void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color); +#endif #endif //cl_khr_mipmap_image +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // Image write functions for half4 type #ifdef cl_khr_fp16 void __ovld write_imageh(read_write image1d_t image, int coord, half4 color); void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color); +#ifdef cl_khr_3d_image_writes void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color); +#endif void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color); void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color); void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color); @@ -16118,7 +15450,9 @@ void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 col int __ovld __cnfn get_image_width(read_only image1d_t image); int __ovld __cnfn get_image_width(read_only image1d_buffer_t image); int __ovld __cnfn get_image_width(read_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_width(read_only image3d_t image); +#endif int __ovld __cnfn get_image_width(read_only image1d_array_t image); int __ovld __cnfn get_image_width(read_only image2d_array_t image); #ifdef cl_khr_depth_images @@ -16135,7 +15469,9 @@ int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image); int __ovld __cnfn get_image_width(write_only image1d_t image); int __ovld __cnfn get_image_width(write_only image1d_buffer_t image); int __ovld __cnfn get_image_width(write_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_width(write_only image3d_t image); +#endif int __ovld __cnfn get_image_width(write_only image1d_array_t image); int __ovld __cnfn get_image_width(write_only image2d_array_t image); #ifdef cl_khr_depth_images @@ -16186,7 +15522,9 @@ int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing int __ovld __cnfn get_image_height(write_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_height(write_only image3d_t image); +#endif int __ovld __cnfn get_image_height(write_only image2d_array_t image); #ifdef cl_khr_depth_images int __ovld __cnfn get_image_height(write_only image2d_depth_t image); @@ -16220,13 +15558,16 @@ int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image); */ int __ovld __cnfn get_image_depth(read_only image3d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_depth(write_only image3d_t image); +#endif #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld __cnfn get_image_depth(read_write image3d_t image); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // OpenCL Extension v2.0 s9.18 - Mipmaps +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifdef cl_khr_mipmap_image /** * Return the image miplevels. @@ -16238,13 +15579,13 @@ int __ovld get_image_num_mip_levels(read_only image3d_t image); int __ovld get_image_num_mip_levels(write_only image1d_t image); int __ovld get_image_num_mip_levels(write_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld get_image_num_mip_levels(write_only image3d_t image); +#endif -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld get_image_num_mip_levels(read_write image1d_t image); int __ovld get_image_num_mip_levels(read_write image2d_t image); int __ovld get_image_num_mip_levels(read_write image3d_t image); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld get_image_num_mip_levels(read_only image1d_array_t image); int __ovld get_image_num_mip_levels(read_only image2d_array_t image); @@ -16256,14 +15597,13 @@ int __ovld get_image_num_mip_levels(write_only image2d_array_t image); int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image); int __ovld get_image_num_mip_levels(write_only image2d_depth_t image); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld get_image_num_mip_levels(read_write image1d_array_t image); int __ovld get_image_num_mip_levels(read_write image2d_array_t image); int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image); int __ovld get_image_num_mip_levels(read_write image2d_depth_t image); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 #endif //cl_khr_mipmap_image +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 /** * Return the channel data type. Valid values are: @@ -16324,7 +15664,9 @@ int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image); int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image); +#endif int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image); int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image); #ifdef cl_khr_depth_images @@ -16418,7 +15760,9 @@ int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t i int __ovld __cnfn get_image_channel_order(write_only image1d_t image); int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_order(write_only image2d_t image); +#ifdef cl_khr_3d_image_writes int __ovld __cnfn get_image_channel_order(write_only image3d_t image); +#endif int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image); int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image); #ifdef cl_khr_depth_images @@ -16504,7 +15848,9 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image); * component and the w component is 0. */ int4 __ovld __cnfn get_image_dim(read_only image3d_t image); +#ifdef cl_khr_3d_image_writes int4 __ovld __cnfn get_image_dim(write_only image3d_t image); +#endif #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int4 __ovld __cnfn get_image_dim(read_write image3d_t image); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -16714,16 +16060,12 @@ typedef int clk_profiling_info; #define MAX_WORK_DIM 3 -// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back -// the following ndrange_t definition. -#if 0 typedef struct { unsigned int workDimension; size_t globalWorkOffset[MAX_WORK_DIM]; size_t globalWorkSize[MAX_WORK_DIM]; size_t localWorkSize[MAX_WORK_DIM]; } ndrange_t; -#endif ndrange_t __ovld ndrange_1D(size_t); ndrange_t __ovld ndrange_1D(size_t, size_t); diff --git a/contrib/llvm/tools/clang/lib/Headers/pmmintrin.h b/contrib/llvm/tools/clang/lib/Headers/pmmintrin.h index d4f6487..559ece2 100644 --- a/contrib/llvm/tools/clang/lib/Headers/pmmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/pmmintrin.h @@ -31,9 +31,11 @@ __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) /// \brief Loads data from an unaligned memory location to elements in a 128-bit -/// vector. If the address of the data is not 16-byte aligned, the -/// instruction may read two adjacent aligned blocks of memory to retrieve -/// the requested data. +/// vector. +/// +/// If the address of the data is not 16-byte aligned, the instruction may +/// read two adjacent aligned blocks of memory to retrieve the requested +/// data. /// /// \headerfile <x86intrin.h> /// @@ -115,7 +117,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b) /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit /// vector of [4 x float] to float values stored in a 128-bit vector of -/// [4 x float]. +/// [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -136,7 +138,7 @@ _mm_movehdup_ps(__m128 __a) } /// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of -/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. +/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -257,14 +259,6 @@ _mm_movedup_pd(__m128d __a) return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); } -#define _MM_DENORMALS_ZERO_ON (0x0040) -#define _MM_DENORMALS_ZERO_OFF (0x0000) - -#define _MM_DENORMALS_ZERO_MASK (0x0040) - -#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) -#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) - /// \brief Establishes a linear address memory range to be monitored and puts /// the processor in the monitor event pending state. Data stored in the /// monitored address range causes the processor to exit the pending state. diff --git a/contrib/llvm/tools/clang/lib/Headers/prfchwintrin.h b/contrib/llvm/tools/clang/lib/Headers/prfchwintrin.h index ba02857..b52f31d 100644 --- a/contrib/llvm/tools/clang/lib/Headers/prfchwintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/prfchwintrin.h @@ -29,12 +29,38 @@ #define __PRFCHWINTRIN_H #if defined(__PRFCHW__) || defined(__3dNOW__) +/// \brief Loads a memory sequence containing the specified memory address into +/// all data cache levels. The cache-coherency state is set to exclusive. +/// Data can be read from and written to the cache line without additional +/// delay. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c PREFETCHT0 instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched. static __inline__ void __attribute__((__always_inline__, __nodebug__)) _m_prefetch(void *__P) { __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); } +/// \brief Loads a memory sequence containing the specified memory address into +/// the L1 data cache and sets the cache-coherency to modified. This +/// provides a hint to the processor that the cache line will be modified. +/// It is intended for use when the cache line will be written to shortly +/// after the prefetch is performed. +/// +/// Note that the effect of this intrinsic is dependent on the processor +/// implementation. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the \c PREFETCHW instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched. static __inline__ void __attribute__((__always_inline__, __nodebug__)) _m_prefetchw(void *__P) { diff --git a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h index e48ab03..c2fa5a4 100644 --- a/contrib/llvm/tools/clang/lib/Headers/smmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/smmintrin.h @@ -46,37 +46,379 @@ #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) +/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_ceil_ps(__m128 X); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float] values to be rounded up. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) + +/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_ceil_pd(__m128d X); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double] values to be rounded up. +/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) + +/// \brief Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. Rounds up the lowest element of the second 128-bit vector +/// operand to an integer and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_ceil_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded up to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) + +/// \brief Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds up the lower element of the second 128-bit vector operand to an +/// integer and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_ceil_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded up to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) +/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an +/// an integer and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_floor_ps(__m128 X); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float] values to be rounded down. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) + +/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_floor_pd(__m128d X); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) + +/// \brief Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. Rounds down the lowest element of the second 128-bit vector +/// operand to an integer and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_floor_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded down to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) + +/// \brief Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds down the lower element of the second 128-bit vector operand to an +/// integer and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_floor_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded down to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) +/// \brief Rounds each element of the 128-bit vector of [4 x float] to an +/// integer value according to the rounding control specified by the second +/// argument and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_round_ps(__m128 X, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_round_ps(X, M) __extension__ ({ \ (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) +/// \brief Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. Rounds the lowest element of the second 128-bit vector +/// operand to an integer value according to the rounding control specified +/// by the third argument and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded to the nearest integer using the specified rounding control and +/// copied to the corresponding bits of the result. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_round_ss(X, Y, M) __extension__ ({ \ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (M)); }) +/// \brief Rounds each element of the 128-bit vector of [2 x double] to an +/// integer value according to the rounding control specified by the second +/// argument and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_round_pd(__m128d X, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_round_pd(X, M) __extension__ ({ \ (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) +/// \brief Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds the lower element of the second 128-bit vector operand to an +/// integer value according to the rounding control specified by the third +/// argument and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded to the nearest integer using the specified rounding control and +/// copied to the corresponding bits of the result. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. #define _mm_round_sd(X, Y, M) __extension__ ({ \ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Packed Blending Intrinsics. */ +/// \brief Returns a 128-bit vector of [2 x double] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. +/// +/// \param V1 +/// A 128-bit vector of [2 x double]. +/// \param V2 +/// A 128-bit vector of [2 x double]. +/// \param M +/// An immediate integer operand, with mask bits [1:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 64-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_blend_pd(V1, V2, M) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ (__v2df)(__m128d)(V2), \ (((M) & 0x01) ? 2 : 0), \ (((M) & 0x02) ? 3 : 1)); }) +/// \brief Returns a 128-bit vector of [4 x float] where the values are selected +/// from either the first or second operand as specified by the third +/// operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction. +/// +/// \param V1 +/// A 128-bit vector of [4 x float]. +/// \param V2 +/// A 128-bit vector of [4 x float]. +/// \param M +/// An immediate integer operand, with mask bits [3:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 32-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_blend_ps(V1, V2, M) __extension__ ({ \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ (((M) & 0x01) ? 4 : 0), \ @@ -84,6 +426,26 @@ (((M) & 0x04) ? 6 : 2), \ (((M) & 0x08) ? 7 : 3)); }) +/// \brief Returns a 128-bit vector of [2 x double] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [2 x double]. +/// \param __V2 +/// A 128-bit vector of [2 x double]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 64-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 64-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [2 x double] containing the copied values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) { @@ -91,6 +453,26 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) (__v2df)__M); } +/// \brief Returns a 128-bit vector of [4 x float] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x float]. +/// \param __V2 +/// A 128-bit vector of [4 x float]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying +/// how the values are to be copied. The position of the mask bit corresponds +/// to the most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 32-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 32-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [4 x float] containing the copied values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) { @@ -98,6 +480,26 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) (__v4sf)__M); } +/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected +/// from either of the first or second operand as specified by the third +/// operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying +/// how the values are to be copied. The position of the mask bit corresponds +/// to the most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 8-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 8-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [16 x i8] containing the copied values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) { @@ -105,6 +507,30 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) (__v16qi)__M); } +/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected +/// from either of the first or second operand as specified by the third +/// operand, the control mask. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. +/// +/// \param V1 +/// A 128-bit vector of [8 x i16]. +/// \param V2 +/// A 128-bit vector of [8 x i16]. +/// \param M +/// An immediate integer operand, with mask bits [7:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 16-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [8 x i16] containing the copied values. #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ (__v8hi)(__m128i)(V2), \ @@ -118,12 +544,39 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) (((M) & 0x80) ? 15 : 7)); }) /* SSE4 Dword Multiply Instructions. */ +/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32] +/// and returns the lower 32 bits of the each product in a 128-bit vector of +/// [4 x i32]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) ((__v4su)__V1 * (__v4su)__V2); } +/// \brief Multiplies corresponding even-indexed elements of two 128-bit +/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] +/// containing the products. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [2 x i64] containing the products of both +/// operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32 (__m128i __V1, __m128i __V2) { @@ -131,64 +584,243 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) } /* SSE4 Floating Point Dot Product Instructions. */ +/// \brief Computes the dot product of the two 128-bit vectors of [4 x float] +/// and returns it in the elements of the 128-bit result vector of +/// [4 x float]. +/// +/// The immediate integer operand controls which input elements +/// will contribute to the dot product, and where the final results are +/// returned. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param Y +/// A 128-bit vector of [4 x float]. +/// \param M +/// An immediate integer operand. Mask bits [7:4] determine which elements +/// of the input vectors are used, with bit [4] corresponding to the lowest +/// element and bit [7] corresponding to the highest element of each [4 x +/// float] vector. If a bit is set, the corresponding elements from the two +/// input vectors are used as an input for dot product; otherwise that input +/// is treated as zero. Bits [3:0] determine which elements of the result +/// will receive a copy of the final dot product, with bit [0] corresponding +/// to the lowest element and bit [3] corresponding to the highest element of +/// each [4 x float] subvector. If a bit is set, the dot product is returned +/// in the corresponding element; otherwise that element is set to zero. +/// \returns A 128-bit vector of [4 x float] containing the dot product. #define _mm_dp_ps(X, Y, M) __extension__ ({ \ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (M)); }) +/// \brief Computes the dot product of the two 128-bit vectors of [2 x double] +/// and returns it in the elements of the 128-bit result vector of +/// [2 x double]. +/// +/// The immediate integer operand controls which input +/// elements will contribute to the dot product, and where the final results +/// are returned. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. +/// \param Y +/// A 128-bit vector of [2 x double]. +/// \param M +/// An immediate integer operand. Mask bits [5:4] determine which elements +/// of the input vectors are used, with bit [4] corresponding to the lowest +/// element and bit [5] corresponding to the highest element of each of [2 x +/// double] vector. If a bit is set, the corresponding elements from the two +/// input vectors are used as an input for dot product; otherwise that input +/// is treated as zero. Bits [1:0] determine which elements of the result +/// will receive a copy of the final dot product, with bit [0] corresponding +/// to the lowest element and bit [3] corresponding to the highest element of +/// each [2 x double] vector. If a bit is set, the dot product is returned in +/// the corresponding element; otherwise that element is set to zero. #define _mm_dp_pd(X, Y, M) __extension__ ({\ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Streaming Load Hint Instruction. */ +/// \brief Loads integer values from a 128-bit aligned memory location to a +/// 128-bit integer vector. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. +/// +/// \param __V +/// A pointer to a 128-bit aligned memory location that contains the integer +/// values. +/// \returns A 128-bit integer vector containing the data stored at the +/// specified memory location. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128 (__m128i const *__V) { - return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V); + return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); } /* SSE4 Packed Integer Min/Max Instructions. */ +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser +/// of the two values. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8] +/// \returns A 128-bit vector of [16 x i8] containing the lesser values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the +/// greater value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit vector of [16 x i8] containing the greater values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser +/// value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [8 x u16]. +/// \param __V2 +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit vector of [8 x u16] containing the lesser values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the +/// greater value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [8 x u16]. +/// \param __V2 +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit vector of [8 x u16] containing the greater values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser +/// value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the lesser values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the +/// greater value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the greater values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser +/// value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x u32]. +/// \param __V2 +/// A 128-bit vector of [4 x u32]. +/// \returns A 128-bit vector of [4 x u32] containing the lesser values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares the corresponding elements of two 128-bit vectors of +/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the +/// greater value of the two. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x u32]. +/// \param __V2 +/// A 128-bit vector of [4 x u32]. +/// \returns A 128-bit vector of [4 x u32] containing the greater values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32 (__m128i __V1, __m128i __V2) { @@ -196,7 +828,70 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) } /* SSE4 Insertion and Extraction from XMM Register Instructions. */ +/// \brief Takes the first argument \a X and inserts an element from the second +/// argument \a Y as selected by the third argument \a N. That result then +/// has elements zeroed out also as selected by the third argument \a N. The +/// resulting 128-bit vector of [4 x float] is then returned. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. +/// +/// \param X +/// A 128-bit vector source operand of [4 x float]. With the exception of +/// those bits in the result copied from parameter \a Y and zeroed by bits +/// [3:0] of \a N, all bits from this parameter are copied to the result. +/// \param Y +/// A 128-bit vector source operand of [4 x float]. One single-precision +/// floating-point element from this source, as determined by the immediate +/// parameter, is copied to the result. +/// \param N +/// Specifies which bits from operand \a Y will be copied, which bits in the +/// result they will be be copied to, and which bits in the result will be +/// cleared. The following assignments are made: \n +/// Bits [7:6] specify the bits to copy from operand \a Y: \n +/// 00: Selects bits [31:0] from operand \a Y. \n +/// 01: Selects bits [63:32] from operand \a Y. \n +/// 10: Selects bits [95:64] from operand \a Y. \n +/// 11: Selects bits [127:96] from operand \a Y. \n +/// Bits [5:4] specify the bits in the result to which the selected bits +/// from operand \a Y are copied: \n +/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n +/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n +/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n +/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n +/// Bits[3:0]: If any of these bits are set, the corresponding result +/// element is cleared. +/// \returns A 128-bit vector of [4 x float] containing the copied single- +/// precision floating point elements from the operands. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) + +/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and +/// returns it, using the immediate value parameter \a N as a selector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_extract_ps(__m128 X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> +/// instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param N +/// An immediate value. Bits [1:0] determines which bits from the argument +/// \a X are extracted and returned: \n +/// 00: Bits [31:0] of parameter \a X are returned. \n +/// 01: Bits [63:32] of parameter \a X are returned. \n +/// 10: Bits [95:64] of parameter \a X are returned. \n +/// 11: Bits [127:96] of parameter \a X are returned. +/// \returns A 32-bit integer containing the extracted 32 bits of float data. #define _mm_extract_ps(X, N) (__extension__ \ ({ union { int __i; float __f; } __t; \ __v4sf __a = (__v4sf)(__m128)(X); \ @@ -217,15 +912,111 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) /* Insert int into packed integer array at index. */ +/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the lower 8 bits +/// of an integer parameter \a I into an offset specified by the immediate +/// value parameter \a N. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction. +/// +/// \param X +/// A 128-bit integer vector of [16 x i8]. This vector is copied to the +/// result and then one of the sixteen elements in the result vector is +/// replaced by the lower 8 bits of \a I. +/// \param I +/// An integer. The lower 8 bits of this operand are written to the result +/// beginning at the offset specified by \a N. +/// \param N +/// An immediate value. Bits [3:0] specify the bit offset in the result at +/// which the lower 8 bits of \a I are written. \n +/// 0000: Bits [7:0] of the result are used for insertion. \n +/// 0001: Bits [15:8] of the result are used for insertion. \n +/// 0010: Bits [23:16] of the result are used for insertion. \n +/// 0011: Bits [31:24] of the result are used for insertion. \n +/// 0100: Bits [39:32] of the result are used for insertion. \n +/// 0101: Bits [47:40] of the result are used for insertion. \n +/// 0110: Bits [55:48] of the result are used for insertion. \n +/// 0111: Bits [63:56] of the result are used for insertion. \n +/// 1000: Bits [71:64] of the result are used for insertion. \n +/// 1001: Bits [79:72] of the result are used for insertion. \n +/// 1010: Bits [87:80] of the result are used for insertion. \n +/// 1011: Bits [95:88] of the result are used for insertion. \n +/// 1100: Bits [103:96] of the result are used for insertion. \n +/// 1101: Bits [111:104] of the result are used for insertion. \n +/// 1110: Bits [119:112] of the result are used for insertion. \n +/// 1111: Bits [127:120] of the result are used for insertion. +/// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi8(X, I, N) (__extension__ \ ({ __v16qi __a = (__v16qi)(__m128i)(X); \ __a[(N) & 15] = (I); \ (__m128i)__a;})) + +/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the 32-bit +/// integer parameter \a I at the offset specified by the immediate value +/// parameter \a N. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction. +/// +/// \param X +/// A 128-bit integer vector of [4 x i32]. This vector is copied to the +/// result and then one of the four elements in the result vector is +/// replaced by \a I. +/// \param I +/// A 32-bit integer that is written to the result beginning at the offset +/// specified by \a N. +/// \param N +/// An immediate value. Bits [1:0] specify the bit offset in the result at +/// which the integer \a I is written. \n +/// 00: Bits [31:0] of the result are used for insertion. \n +/// 01: Bits [63:32] of the result are used for insertion. \n +/// 10: Bits [95:64] of the result are used for insertion. \n +/// 11: Bits [127:96] of the result are used for insertion. +/// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi32(X, I, N) (__extension__ \ ({ __v4si __a = (__v4si)(__m128i)(X); \ __a[(N) & 3] = (I); \ (__m128i)__a;})) + #ifdef __x86_64__ +/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the 64-bit +/// integer parameter \a I, using the immediate value parameter \a N as an +/// insertion location selector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. +/// +/// \param X +/// A 128-bit integer vector of [2 x i64]. This vector is copied to the +/// result and then one of the two elements in the result vector is replaced +/// by \a I. +/// \param I +/// A 64-bit integer that is written to the result beginning at the offset +/// specified by \a N. +/// \param N +/// An immediate value. Bit [0] specifies the bit offset in the result at +/// which the integer \a I is written. \n +/// 0: Bits [63:0] of the result are used for insertion. \n +/// 1: Bits [127:64] of the result are used for insertion. \n +/// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi64(X, I, N) (__extension__ \ ({ __v2di __a = (__v2di)(__m128i)(X); \ __a[(N) & 1] = (I); \ @@ -235,42 +1026,219 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /* Extract int from packed integer array at index. This returns the element * as a zero extended value, so it is unsigned. */ +/// \brief Extracts an 8-bit element from the 128-bit integer vector of +/// [16 x i8], using the immediate value parameter \a N as a selector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_extract_epi8(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bits [3:0] specify which 8-bit vector element from +/// the argument \a X to extract and copy to the result. \n +/// 0000: Bits [7:0] of parameter \a X are extracted. \n +/// 0001: Bits [15:8] of the parameter \a X are extracted. \n +/// 0010: Bits [23:16] of the parameter \a X are extracted. \n +/// 0011: Bits [31:24] of the parameter \a X are extracted. \n +/// 0100: Bits [39:32] of the parameter \a X are extracted. \n +/// 0101: Bits [47:40] of the parameter \a X are extracted. \n +/// 0110: Bits [55:48] of the parameter \a X are extracted. \n +/// 0111: Bits [63:56] of the parameter \a X are extracted. \n +/// 1000: Bits [71:64] of the parameter \a X are extracted. \n +/// 1001: Bits [79:72] of the parameter \a X are extracted. \n +/// 1010: Bits [87:80] of the parameter \a X are extracted. \n +/// 1011: Bits [95:88] of the parameter \a X are extracted. \n +/// 1100: Bits [103:96] of the parameter \a X are extracted. \n +/// 1101: Bits [111:104] of the parameter \a X are extracted. \n +/// 1110: Bits [119:112] of the parameter \a X are extracted. \n +/// 1111: Bits [127:120] of the parameter \a X are extracted. +/// \returns An unsigned integer, whose lower 8 bits are selected from the +/// 128-bit integer vector parameter and the remaining bits are assigned +/// zeros. #define _mm_extract_epi8(X, N) (__extension__ \ ({ __v16qi __a = (__v16qi)(__m128i)(X); \ (int)(unsigned char) __a[(N) & 15];})) + +/// \brief Extracts a 32-bit element from the 128-bit integer vector of +/// [4 x i32], using the immediate value parameter \a N as a selector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_extract_epi32(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bits [1:0] specify which 32-bit vector element from +/// the argument \a X to extract and copy to the result. \n +/// 00: Bits [31:0] of the parameter \a X are extracted. \n +/// 01: Bits [63:32] of the parameter \a X are extracted. \n +/// 10: Bits [95:64] of the parameter \a X are extracted. \n +/// 11: Bits [127:96] of the parameter \a X are exracted. +/// \returns An integer, whose lower 32 bits are selected from the 128-bit +/// integer vector parameter and the remaining bits are assigned zeros. #define _mm_extract_epi32(X, N) (__extension__ \ ({ __v4si __a = (__v4si)(__m128i)(X); \ (int)__a[(N) & 3];})) + #ifdef __x86_64__ +/// \brief Extracts a 64-bit element from the 128-bit integer vector of +/// [2 x i64], using the immediate value parameter \a N as a selector. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// long long _mm_extract_epi64(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bit [0] specifies which 64-bit vector element from +/// the argument \a X to return. \n +/// 0: Bits [63:0] are returned. \n +/// 1: Bits [127:64] are returned. \n +/// \returns A 64-bit integer. #define _mm_extract_epi64(X, N) (__extension__ \ ({ __v2di __a = (__v2di)(__m128i)(X); \ (long long)__a[(N) & 1];})) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are all ones; FALSE otherwise. static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_test_all_ones(__m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param V +/// A 128-bit integer vector containing the bits to be tested. +/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE +/// otherwise. #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) + +/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. +/// \param V +/// A 128-bit integer vector selecting which bits to test in operand \a M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) + +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_test_all_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. +/// \param V +/// A 128-bit integer vector selecting which bits to test in operand \a M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) /* SSE4 64-bit Packed Integer Comparisons. */ +/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// integer vectors for equality. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) { @@ -278,6 +1246,19 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) } /* SSE4 Packed Integer Sign-Extension. */ +/// \brief Sign-extends each of the lower eight 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a +/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector +/// are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- +/// extended to 16-bit values. +/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { @@ -286,6 +1267,19 @@ _mm_cvtepi8_epi16(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } +/// \brief Sign-extends each of the lower four 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a +/// 128-bit vector of [4 x i32]. The upper twelve elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- +/// extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { @@ -294,6 +1288,19 @@ _mm_cvtepi8_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); } +/// \brief Sign-extends each of the lower two 8-bit integer elements of a +/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { @@ -302,18 +1309,57 @@ _mm_cvtepi8_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); } +/// \brief Sign-extends each of the lower four 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in +/// a 128-bit vector of [4 x i32]. The upper four elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- +/// extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } +/// \brief Sign-extends each of the lower two 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper six elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } +/// \brief Sign-extends each of the lower two 32-bit integer elements of a +/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector +/// are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { @@ -321,36 +1367,114 @@ _mm_cvtepi32_epi64(__m128i __V) } /* SSE4 Packed Integer Zero-Extension. */ +/// \brief Zero-extends each of the lower eight 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a +/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector +/// are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero- +/// extended to 16-bit values. +/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } +/// \brief Zero-extends each of the lower four 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a +/// 128-bit vector of [4 x i32]. The upper twelve elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- +/// extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); } +/// \brief Zero-extends each of the lower two 8-bit integer elements of a +/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); } +/// \brief Zero-extends each of the lower four 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in +/// a 128-bit vector of [4 x i32]. The upper four elements of the input +/// vector are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- +/// extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); } +/// \brief Zero-extends each of the lower two 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector +/// are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); } +/// \brief Zero-extends each of the lower two 32-bit integer elements of a +/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector +/// are unused. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- +/// extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { @@ -358,6 +1482,28 @@ _mm_cvtepu32_epi64(__m128i __V) } /* SSE4 Pack with Unsigned Saturation. */ +/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit unsigned integers, and returns the packed result. +/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than +/// 0x0000 are saturated to 0x0000. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a +/// signed integer and is converted to a 16-bit unsigned integer with +/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values +/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values +/// are written to the lower 64 bits of the result. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a +/// signed integer and is converted to a 16-bit unsigned integer with +/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values +/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values +/// are written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2) { @@ -365,10 +1511,58 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) } /* SSE4 Multiple Packed Sums of Absolute Difference. */ +/// \brief Subtracts 8-bit unsigned integer values and computes the absolute +/// values of the differences to the corresponding bits in the destination. +/// Then sums of the absolute differences are returned according to the bit +/// fields in the immediate operand. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. +/// +/// \param X +/// A 128-bit vector of [16 x i8]. +/// \param Y +/// A 128-bit vector of [16 x i8]. +/// \param M +/// An 8-bit immediate operand specifying how the absolute differences are to +/// be calculated, according to the following algorithm: +/// \code +/// // M2 represents bit 2 of the immediate operand +/// // M10 represents bits [1:0] of the immediate operand +/// i = M2 * 4 +/// j = M10 * 4 +/// for (k = 0; k < 8; k = k + 1) { +/// d0 = abs(X[i + k + 0] - Y[j + 0]) +/// d1 = abs(X[i + k + 1] - Y[j + 1]) +/// d2 = abs(X[i + k + 2] - Y[j + 2]) +/// d3 = abs(X[i + k + 3] - Y[j + 3]) +/// r[k] = d0 + d1 + d2 + d3 +/// } +/// \endcode +/// \returns A 128-bit integer vector containing the sums of the sets of +/// absolute differences between both operands. #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ (__v16qi)(__m128i)(Y), (M)); }) +/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit +/// vector of [8 x u16] and returns it and along with its index. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> +/// instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit value where bits [15:0] contain the minimum value found +/// in parameter \a __V, bits [18:16] contain the index of the minimum value +/// and the remaining bits are set to 0. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { @@ -410,61 +1604,769 @@ _mm_minpos_epu16(__m128i __V) #define _SIDD_UNIT_MASK 0x40 /* SSE4.2 Packed Comparison Intrinsics. */ +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns a 128-bit integer vector representing the result +/// mask of the comparison. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 +/// bytes. \n +/// 0: The result is zero-extended to 16 bytes. \n +/// 1: The result is expanded to 16 bytes (this expansion is performed by +/// repeating each bit 8 or 16 times). +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. #define _mm_cmpistrm(A, B, M) \ (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns an integer representing the result index of the +/// comparison. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistri(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the index of the lowest set bit or the +/// highest set bit is returned. \n +/// 0: The index of the least significant set bit. \n +/// 1: The index of the most significant set bit. \n +/// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpistri(A, B, M) \ (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns a 128-bit integer vector representing the result +/// mask of the comparison. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 +/// bytes. \n +/// 0: The result is zero-extended to 16 bytes. \n +/// 1: The result is expanded to 16 bytes (this expansion is performed by +/// repeating each bit 8 or 16 times). \n +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. #define _mm_cmpestrm(A, LA, B, LB, M) \ (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns an integer representing the result index of the +/// comparison. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the index of the lowest set bit or the +/// highest set bit is returned. \n +/// 0: The index of the least significant set bit. \n +/// 1: The index of the most significant set bit. \n +/// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpestri(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the +/// string in \a B is the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistra(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the bit mask is zero and the length of the string in +/// \a B is the maximum; otherwise, returns 0. #define _mm_cmpistra(A, B, M) \ (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns +/// 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistrc(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0. #define _mm_cmpistrc(A, B, M) \ (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns bit 0 of the resulting bit mask. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistro(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns bit 0 of the resulting bit mask. #define _mm_cmpistro(A, B, M) \ (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a A is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistrs(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the length of the string in \a A is less than the +/// maximum, otherwise, returns 0. #define _mm_cmpistrs(A, B, M) \ (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a B is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpistrz(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the length of the string in \a B is less than the +/// maximum, otherwise, returns 0. #define _mm_cmpistrz(A, B, M) \ (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the +/// string in \a B is the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the bit mask is zero and the length of the string in +/// \a B is the maximum, otherwise, returns 0. #define _mm_cmpestra(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, +/// returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0. #define _mm_cmpestrc(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns bit 0 of the resulting bit mask. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns bit 0 of the resulting bit mask. #define _mm_cmpestro(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a A is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement in the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the length of the string in \a A is less than the +/// maximum, otherwise, returns 0. #define _mm_cmpestrs(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) + +/// \brief Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a B is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the length of the string in \a B is less than the +/// maximum, otherwise, returns 0. #define _mm_cmpestrz(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) /* SSE4.2 Compare Packed Data -- Greater Than. */ +/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// integer vectors to determine if the values in the first operand are +/// greater than those in the second operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) { @@ -472,18 +2374,60 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) } /* SSE4.2 Accumulate CRC32. */ +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned char operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CRC32B </c> instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u8(unsigned int __C, unsigned char __D) { return __builtin_ia32_crc32qi(__C, __D); } +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned short operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CRC32W </c> instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u16(unsigned int __C, unsigned short __D) { return __builtin_ia32_crc32hi(__C, __D); } +/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of +/// the second unsigned integer operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CRC32L </c> instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u32(unsigned int __C, unsigned int __D) { @@ -491,6 +2435,20 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D) } #ifdef __x86_64__ +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned 64-bit integer operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> CRC32Q </c> instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. static __inline__ unsigned long long __DEFAULT_FN_ATTRS _mm_crc32_u64(unsigned long long __C, unsigned long long __D) { diff --git a/contrib/llvm/tools/clang/lib/Headers/stdarg.h b/contrib/llvm/tools/clang/lib/Headers/stdarg.h index a57e183..101426f 100644 --- a/contrib/llvm/tools/clang/lib/Headers/stdarg.h +++ b/contrib/llvm/tools/clang/lib/Headers/stdarg.h @@ -43,10 +43,9 @@ typedef __builtin_va_list va_list; #define va_copy(dest, src) __builtin_va_copy(dest, src) #endif -/* Hack required to make standard headers work, at least on Ubuntu */ #ifndef __GNUC_VA_LIST #define __GNUC_VA_LIST 1 -#endif typedef __builtin_va_list __gnuc_va_list; +#endif #endif /* __STDARG_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/stdatomic.h b/contrib/llvm/tools/clang/lib/Headers/stdatomic.h index 23bb3a3..b4845a7 100644 --- a/contrib/llvm/tools/clang/lib/Headers/stdatomic.h +++ b/contrib/llvm/tools/clang/lib/Headers/stdatomic.h @@ -40,16 +40,16 @@ extern "C" { /* 7.17.1 Introduction */ -#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +#define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +#define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +#define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +#define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +#define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE /* 7.17.2 Initialization */ diff --git a/contrib/llvm/tools/clang/lib/Headers/stdint.h b/contrib/llvm/tools/clang/lib/Headers/stdint.h index 3f2fcbc..c488153 100644 --- a/contrib/llvm/tools/clang/lib/Headers/stdint.h +++ b/contrib/llvm/tools/clang/lib/Headers/stdint.h @@ -255,19 +255,16 @@ typedef __uint_least8_t uint_fast8_t; */ #define __stdint_join3(a,b,c) a ## b ## c -#define __intn_t(n) __stdint_join3( int, n, _t) -#define __uintn_t(n) __stdint_join3(uint, n, _t) - #ifndef _INTPTR_T #ifndef __intptr_t_defined -typedef __intn_t(__INTPTR_WIDTH__) intptr_t; +typedef __INTPTR_TYPE__ intptr_t; #define __intptr_t_defined #define _INTPTR_T #endif #endif #ifndef _UINTPTR_T -typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t; +typedef __UINTPTR_TYPE__ uintptr_t; #define _UINTPTR_T #endif @@ -659,12 +656,12 @@ typedef __UINTMAX_TYPE__ uintmax_t; /* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */ /* C99 7.18.3 Limits of other integer types. */ -#define INTPTR_MIN __INTN_MIN(__INTPTR_WIDTH__) -#define INTPTR_MAX __INTN_MAX(__INTPTR_WIDTH__) -#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__) -#define PTRDIFF_MIN __INTN_MIN(__PTRDIFF_WIDTH__) -#define PTRDIFF_MAX __INTN_MAX(__PTRDIFF_WIDTH__) -#define SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__) +#define INTPTR_MIN (-__INTPTR_MAX__-1) +#define INTPTR_MAX __INTPTR_MAX__ +#define UINTPTR_MAX __UINTPTR_MAX__ +#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1) +#define PTRDIFF_MAX __PTRDIFF_MAX__ +#define SIZE_MAX __SIZE_MAX__ /* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__ * is enabled. */ @@ -673,9 +670,9 @@ typedef __UINTMAX_TYPE__ uintmax_t; #endif /* C99 7.18.2.5 Limits of greatest-width integer types. */ -#define INTMAX_MIN __INTN_MIN(__INTMAX_WIDTH__) -#define INTMAX_MAX __INTN_MAX(__INTMAX_WIDTH__) -#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__) +#define INTMAX_MIN (-__INTMAX_MAX__-1) +#define INTMAX_MAX __INTMAX_MAX__ +#define UINTMAX_MAX __UINTMAX_MAX__ /* C99 7.18.3 Limits of other integer types. */ #define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__) @@ -700,8 +697,8 @@ typedef __UINTMAX_TYPE__ uintmax_t; #endif /* 7.18.4.2 Macros for greatest-width integer constants. */ -#define INTMAX_C(v) __INTN_C(__INTMAX_WIDTH__, v) -#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v) +#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__) +#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__) #endif /* __STDC_HOSTED__ */ #endif /* __CLANG_STDINT_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/tgmath.h b/contrib/llvm/tools/clang/lib/Headers/tgmath.h index 318e118..34e26dc 100644 --- a/contrib/llvm/tools/clang/lib/Headers/tgmath.h +++ b/contrib/llvm/tools/clang/lib/Headers/tgmath.h @@ -22,12 +22,21 @@ * \*===----------------------------------------------------------------------===*/ -#ifndef __TGMATH_H -#define __TGMATH_H +#ifndef __CLANG_TGMATH_H +#define __CLANG_TGMATH_H /* C99 7.22 Type-generic math <tgmath.h>. */ #include <math.h> +/* + * Allow additional definitions and implementation-defined values on Apple + * platforms. This is done after #include <math.h> to avoid depcycle conflicts + * between libcxx and darwin in C++ modules builds. + */ +#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>) +# include_next <tgmath.h> +#else + /* C++ handles type genericity with overloading in math.h. */ #ifndef __cplusplus #include <complex.h> @@ -1371,4 +1380,5 @@ static long double #undef _TG_ATTRS #endif /* __cplusplus */ -#endif /* __TGMATH_H */ +#endif /* __has_include_next */ +#endif /* __CLANG_TGMATH_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/tmmintrin.h b/contrib/llvm/tools/clang/lib/Headers/tmmintrin.h index 8066404..042bfc7 100644 --- a/contrib/llvm/tools/clang/lib/Headers/tmmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/tmmintrin.h @@ -469,10 +469,11 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to -/// the corresponding bits in the destination. For example, bits [7:0] of -/// both operands are multiplied, bits [15:8] of both operands are -/// multiplied, and the sum of both results is written to bits [15:0] of the -/// destination. +/// the corresponding bits in the destination. +/// +/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of +/// both operands are multiplied, and the sum of both results is written to +/// bits [15:0] of the destination. /// /// \headerfile <x86intrin.h> /// @@ -502,10 +503,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to -/// the corresponding bits in the destination. For example, bits [7:0] of -/// both operands are multiplied, bits [15:8] of both operands are -/// multiplied, and the sum of both results is written to bits [15:0] of the -/// destination. +/// the corresponding bits in the destination. +/// +/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of +/// both operands are multiplied, and the sum of both results is written to +/// bits [15:0] of the destination. /// /// \headerfile <x86intrin.h> /// @@ -619,13 +621,14 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b) } /// \brief For each 8-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// byte in the second source is negative, calculate the two's complement of -/// the corresponding byte in the first source, and write that value to the -/// destination. If the byte in the second source is positive, copy the -/// corresponding byte from the first source to the destination. If the byte -/// in the second source is zero, clear the corresponding byte in the -/// destination. +/// the following actions as specified by the second source operand. +/// +/// If the byte in the second source is negative, calculate the two's +/// complement of the corresponding byte in the first source, and write that +/// value to the destination. If the byte in the second source is positive, +/// copy the corresponding byte from the first source to the destination. If +/// the byte in the second source is zero, clear the corresponding byte in +/// the destination. /// /// \headerfile <x86intrin.h> /// @@ -644,13 +647,14 @@ _mm_sign_epi8(__m128i __a, __m128i __b) } /// \brief For each 16-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// word in the second source is negative, calculate the two's complement of -/// the corresponding word in the first source, and write that value to the -/// destination. If the word in the second source is positive, copy the -/// corresponding word from the first source to the destination. If the word -/// in the second source is zero, clear the corresponding word in the -/// destination. +/// the following actions as specified by the second source operand. +/// +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write that +/// value to the destination. If the word in the second source is positive, +/// copy the corresponding word from the first source to the destination. If +/// the word in the second source is zero, clear the corresponding word in +/// the destination. /// /// \headerfile <x86intrin.h> /// @@ -669,8 +673,9 @@ _mm_sign_epi16(__m128i __a, __m128i __b) } /// \brief For each 32-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// doubleword in the second source is negative, calculate the two's +/// the following actions as specified by the second source operand. +/// +/// If the doubleword in the second source is negative, calculate the two's /// complement of the corresponding word in the first source, and write that /// value to the destination. If the doubleword in the second source is /// positive, copy the corresponding word from the first source to the @@ -694,13 +699,14 @@ _mm_sign_epi32(__m128i __a, __m128i __b) } /// \brief For each 8-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// byte in the second source is negative, calculate the two's complement of -/// the corresponding byte in the first source, and write that value to the -/// destination. If the byte in the second source is positive, copy the -/// corresponding byte from the first source to the destination. If the byte -/// in the second source is zero, clear the corresponding byte in the -/// destination. +/// the following actions as specified by the second source operand. +/// +/// If the byte in the second source is negative, calculate the two's +/// complement of the corresponding byte in the first source, and write that +/// value to the destination. If the byte in the second source is positive, +/// copy the corresponding byte from the first source to the destination. If +/// the byte in the second source is zero, clear the corresponding byte in +/// the destination. /// /// \headerfile <x86intrin.h> /// @@ -719,13 +725,14 @@ _mm_sign_pi8(__m64 __a, __m64 __b) } /// \brief For each 16-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// word in the second source is negative, calculate the two's complement of -/// the corresponding word in the first source, and write that value to the -/// destination. If the word in the second source is positive, copy the -/// corresponding word from the first source to the destination. If the word -/// in the second source is zero, clear the corresponding word in the -/// destination. +/// the following actions as specified by the second source operand. +/// +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write that +/// value to the destination. If the word in the second source is positive, +/// copy the corresponding word from the first source to the destination. If +/// the word in the second source is zero, clear the corresponding word in +/// the destination. /// /// \headerfile <x86intrin.h> /// @@ -744,8 +751,9 @@ _mm_sign_pi16(__m64 __a, __m64 __b) } /// \brief For each 32-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand: If the -/// doubleword in the second source is negative, calculate the two's +/// the following actions as specified by the second source operand. +/// +/// If the doubleword in the second source is negative, calculate the two's /// complement of the corresponding doubleword in the first source, and /// write that value to the destination. If the doubleword in the second /// source is positive, copy the corresponding doubleword from the first diff --git a/contrib/llvm/tools/clang/lib/Headers/vecintrin.h b/contrib/llvm/tools/clang/lib/Headers/vecintrin.h index ca7acb4..f7061e8 100644 --- a/contrib/llvm/tools/clang/lib/Headers/vecintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/vecintrin.h @@ -116,6 +116,13 @@ vec_extract(vector unsigned long long __vec, int __index) { return __vec[__index & 1]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai float +vec_extract(vector float __vec, int __index) { + return __vec[__index & 3]; +} +#endif + static inline __ATTRS_o_ai double vec_extract(vector double __vec, int __index) { return __vec[__index & 1]; @@ -129,6 +136,7 @@ vec_insert(signed char __scalar, vector signed char __vec, int __index) { return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_insert(unsigned char __scalar, vector bool char __vec, int __index) { vector unsigned char __newvec = (vector unsigned char)__vec; @@ -148,6 +156,7 @@ vec_insert(signed short __scalar, vector signed short __vec, int __index) { return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_insert(unsigned short __scalar, vector bool short __vec, int __index) { vector unsigned short __newvec = (vector unsigned short)__vec; @@ -167,6 +176,7 @@ vec_insert(signed int __scalar, vector signed int __vec, int __index) { return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_insert(unsigned int __scalar, vector bool int __vec, int __index) { vector unsigned int __newvec = (vector unsigned int)__vec; @@ -187,6 +197,7 @@ vec_insert(signed long long __scalar, vector signed long long __vec, return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_insert(unsigned long long __scalar, vector bool long long __vec, int __index) { @@ -202,6 +213,14 @@ vec_insert(unsigned long long __scalar, vector unsigned long long __vec, return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_insert(float __scalar, vector float __vec, int __index) { + __vec[__index & 1] = __scalar; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_insert(double __scalar, vector double __vec, int __index) { __vec[__index & 1] = __scalar; @@ -282,6 +301,16 @@ vec_promote(unsigned long long __scalar, int __index) { return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_promote(float __scalar, int __index) { + const vector float __zero = (vector float)0; + vector float __vec = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1); + __vec[__index & 3] = __scalar; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_promote(double __scalar, int __index) { const vector double __zero = (vector double)0; @@ -348,6 +377,15 @@ vec_insert_and_zero(const unsigned long long *__ptr) { return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_insert_and_zero(const float *__ptr) { + vector float __vec = (vector float)0; + __vec[0] = *__ptr; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_insert_and_zero(const double *__ptr) { vector double __vec = (vector double)0; @@ -441,6 +479,15 @@ vec_perm(vector bool long long __a, vector bool long long __b, (vector unsigned char)__a, (vector unsigned char)__b, __c); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_perm(vector float __a, vector float __b, + vector unsigned char __c) { + return (vector float)__builtin_s390_vperm( + (vector unsigned char)__a, (vector unsigned char)__b, __c); +} +#endif + static inline __ATTRS_o_ai vector double vec_perm(vector double __a, vector double __b, vector unsigned char __c) { @@ -450,18 +497,22 @@ vec_perm(vector double __a, vector double __b, /*-- vec_permi --------------------------------------------------------------*/ +// This prototype is deprecated. extern __ATTRS_o vector signed long long vec_permi(vector signed long long __a, vector signed long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector unsigned long long vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector bool long long vec_permi(vector bool long long __a, vector bool long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector double vec_permi(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 3); @@ -471,6 +522,15 @@ vec_permi(vector double __a, vector double __b, int __c) (vector unsigned long long)(Y), \ (((Z) & 2) << 1) | ((Z) & 1))) +/*-- vec_bperm_u128 ---------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector unsigned long long +vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) { + return __builtin_s390_vbperm(__a, __b); +} +#endif + /*-- vec_sel ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -614,6 +674,22 @@ vec_sel(vector unsigned long long __a, vector unsigned long long __b, (~(vector unsigned long long)__c & __a)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_sel(vector float __a, vector float __b, vector unsigned int __c) { + return (vector float)((__c & (vector unsigned int)__b) | + (~__c & (vector unsigned int)__a)); +} + +static inline __ATTRS_o_ai vector float +vec_sel(vector float __a, vector float __b, vector bool int __c) { + vector unsigned int __ac = (vector unsigned int)__a; + vector unsigned int __bc = (vector unsigned int)__b; + vector unsigned int __cc = (vector unsigned int)__c; + return (vector float)((__cc & __bc) | (~__cc & __ac)); +} +#endif + static inline __ATTRS_o_ai vector double vec_sel(vector double __a, vector double __b, vector unsigned long long __c) { return (vector double)((__c & (vector unsigned long long)__b) | @@ -687,6 +763,17 @@ vec_gather_element(vector unsigned long long __vec, return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_gather_element(vector float __vec, vector unsigned int __offset, + const float *__ptr, int __index) + __constant_range(__index, 0, 3) { + __vec[__index] = *(const float *)( + (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]); + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_gather_element(vector double __vec, vector unsigned long long __offset, const double *__ptr, int __index) @@ -749,6 +836,16 @@ vec_scatter_element(vector unsigned long long __vec, __vec[__index]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_scatter_element(vector float __vec, vector unsigned int __offset, + float *__ptr, int __index) + __constant_range(__index, 0, 3) { + *(float *)((__INTPTR_TYPE__)__ptr + __offset[__index]) = + __vec[__index]; +} +#endif + static inline __ATTRS_o_ai void vec_scatter_element(vector double __vec, vector unsigned long long __offset, double *__ptr, int __index) @@ -757,48 +854,111 @@ vec_scatter_element(vector double __vec, vector unsigned long long __offset, __vec[__index]; } +/*-- vec_xl -----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed char +vec_xl(long __offset, const signed char *__ptr) { + return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_xl(long __offset, const unsigned char *__ptr) { + return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed short +vec_xl(long __offset, const signed short *__ptr) { + return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_xl(long __offset, const unsigned short *__ptr) { + return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed int +vec_xl(long __offset, const signed int *__ptr) { + return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_xl(long __offset, const unsigned int *__ptr) { + return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed long long +vec_xl(long __offset, const signed long long *__ptr) { + return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_xl(long __offset, const unsigned long long *__ptr) { + return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset); +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_xl(long __offset, const float *__ptr) { + return *(const vector float *)((__INTPTR_TYPE__)__ptr + __offset); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_xl(long __offset, const double *__ptr) { + return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset); +} + /*-- vec_xld2 ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_xld2(long __offset, const signed char *__ptr) { return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_xld2(long __offset, const unsigned char *__ptr) { return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_xld2(long __offset, const signed short *__ptr) { return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_xld2(long __offset, const unsigned short *__ptr) { return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_xld2(long __offset, const signed int *__ptr) { return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_xld2(long __offset, const unsigned int *__ptr) { return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_xld2(long __offset, const signed long long *__ptr) { return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_xld2(long __offset, const unsigned long long *__ptr) { return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_xld2(long __offset, const double *__ptr) { return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset); @@ -806,74 +966,145 @@ vec_xld2(long __offset, const double *__ptr) { /*-- vec_xlw4 ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_xlw4(long __offset, const signed char *__ptr) { return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_xlw4(long __offset, const unsigned char *__ptr) { return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_xlw4(long __offset, const signed short *__ptr) { return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_xlw4(long __offset, const unsigned short *__ptr) { return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_xlw4(long __offset, const signed int *__ptr) { return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_xlw4(long __offset, const unsigned int *__ptr) { return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); } +/*-- vec_xst ----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai void +vec_xst(vector signed char __vec, long __offset, signed char *__ptr) { + *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned char __vec, long __offset, unsigned char *__ptr) { + *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed short __vec, long __offset, signed short *__ptr) { + *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned short __vec, long __offset, unsigned short *__ptr) { + *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed int __vec, long __offset, signed int *__ptr) { + *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned int __vec, long __offset, unsigned int *__ptr) { + *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed long long __vec, long __offset, + signed long long *__ptr) { + *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned long long __vec, long __offset, + unsigned long long *__ptr) { + *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) = + __vec; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_xst(vector float __vec, long __offset, float *__ptr) { + *(vector float *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} +#endif + +static inline __ATTRS_o_ai void +vec_xst(vector double __vec, long __offset, double *__ptr) { + *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + /*-- vec_xstd2 --------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) { *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) { *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) { *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) { *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) { *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) { *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed long long __vec, long __offset, signed long long *__ptr) { *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned long long __vec, long __offset, unsigned long long *__ptr) { @@ -881,6 +1112,7 @@ vec_xstd2(vector unsigned long long __vec, long __offset, __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector double __vec, long __offset, double *__ptr) { *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; @@ -888,31 +1120,37 @@ vec_xstd2(vector double __vec, long __offset, double *__ptr) { /*-- vec_xstw4 --------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) { *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) { *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) { *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) { *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) { *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) { *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; @@ -952,6 +1190,12 @@ extern __ATTRS_o vector unsigned long long vec_load_bndry(const unsigned long long *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); +#if __ARCH__ >= 12 +extern __ATTRS_o vector float +vec_load_bndry(const float *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); +#endif + extern __ATTRS_o vector double vec_load_bndry(const double *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); @@ -1007,11 +1251,27 @@ vec_load_len(const unsigned long long *__ptr, unsigned int __len) { return (vector unsigned long long)__builtin_s390_vll(__len, __ptr); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_load_len(const float *__ptr, unsigned int __len) { + return (vector float)__builtin_s390_vll(__len, __ptr); +} +#endif + static inline __ATTRS_o_ai vector double vec_load_len(const double *__ptr, unsigned int __len) { return (vector double)__builtin_s390_vll(__len, __ptr); } +/*-- vec_load_len_r ---------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector unsigned char +vec_load_len_r(const unsigned char *__ptr, unsigned int __len) { + return (vector unsigned char)__builtin_s390_vlrl(__len, __ptr); +} +#endif + /*-- vec_store_len ----------------------------------------------------------*/ static inline __ATTRS_o_ai void @@ -1062,12 +1322,30 @@ vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr, __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_store_len(vector float __vec, float *__ptr, + unsigned int __len) { + __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); +} +#endif + static inline __ATTRS_o_ai void vec_store_len(vector double __vec, double *__ptr, unsigned int __len) { __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); } +/*-- vec_store_len_r --------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai void +vec_store_len_r(vector unsigned char __vec, unsigned char *__ptr, + unsigned int __len) { + __builtin_s390_vstrl((vector signed char)__vec, __len, __ptr); +} +#endif + /*-- vec_load_pair ----------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed long long @@ -1232,6 +1510,14 @@ vec_splat(vector unsigned long long __vec, int __index) return (vector unsigned long long)__vec[__index]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_splat(vector float __vec, int __index) + __constant_range(__index, 0, 3) { + return (vector float)__vec[__index]; +} +#endif + static inline __ATTRS_o_ai vector double vec_splat(vector double __vec, int __index) __constant_range(__index, 0, 1) { @@ -1332,6 +1618,13 @@ vec_splats(unsigned long long __scalar) { return (vector unsigned long long)__scalar; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_splats(float __scalar) { + return (vector float)__scalar; +} +#endif + static inline __ATTRS_o_ai vector double vec_splats(double __scalar) { return (vector double)__scalar; @@ -1425,6 +1718,13 @@ vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) { return (vector unsigned long long)(__a[0], __b[0]); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_mergeh(vector float __a, vector float __b) { + return (vector float)(__a[0], __b[0], __a[1], __b[1]); +} +#endif + static inline __ATTRS_o_ai vector double vec_mergeh(vector double __a, vector double __b) { return (vector double)(__a[0], __b[0]); @@ -1501,6 +1801,13 @@ vec_mergel(vector unsigned long long __a, vector unsigned long long __b) { return (vector unsigned long long)(__a[1], __b[1]); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_mergel(vector float __a, vector float __b) { + return (vector float)(__a[2], __b[2], __a[3], __b[3]); +} +#endif + static inline __ATTRS_o_ai vector double vec_mergel(vector double __a, vector double __b) { return (vector double)(__a[1], __b[1]); @@ -1866,6 +2173,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a == __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpeq(vector float __a, vector float __b) { + return (vector bool int)(__a == __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpeq(vector double __a, vector double __b) { return (vector bool long long)(__a == __b); @@ -1913,6 +2227,13 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a >= __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpge(vector float __a, vector float __b) { + return (vector bool int)(__a >= __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpge(vector double __a, vector double __b) { return (vector bool long long)(__a >= __b); @@ -1960,6 +2281,13 @@ vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a > __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpgt(vector float __a, vector float __b) { + return (vector bool int)(__a > __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpgt(vector double __a, vector double __b) { return (vector bool long long)(__a > __b); @@ -2007,6 +2335,13 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a <= __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmple(vector float __a, vector float __b) { + return (vector bool int)(__a <= __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmple(vector double __a, vector double __b) { return (vector bool long long)(__a <= __b); @@ -2054,6 +2389,13 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a < __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmplt(vector float __a, vector float __b) { + return (vector bool int)(__a < __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmplt(vector double __a, vector double __b) { return (vector bool long long)(__a < __b); @@ -2068,6 +2410,7 @@ vec_all_eq(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed char __a, vector bool char __b) { int __cc; @@ -2075,6 +2418,7 @@ vec_all_eq(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool char __a, vector signed char __b) { int __cc; @@ -2090,6 +2434,7 @@ vec_all_eq(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2098,6 +2443,7 @@ vec_all_eq(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2121,6 +2467,7 @@ vec_all_eq(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed short __a, vector bool short __b) { int __cc; @@ -2128,6 +2475,7 @@ vec_all_eq(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool short __a, vector signed short __b) { int __cc; @@ -2143,6 +2491,7 @@ vec_all_eq(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2151,6 +2500,7 @@ vec_all_eq(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2174,6 +2524,7 @@ vec_all_eq(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed int __a, vector bool int __b) { int __cc; @@ -2181,6 +2532,7 @@ vec_all_eq(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool int __a, vector signed int __b) { int __cc; @@ -2196,6 +2548,7 @@ vec_all_eq(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2204,6 +2557,7 @@ vec_all_eq(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2227,6 +2581,7 @@ vec_all_eq(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2234,6 +2589,7 @@ vec_all_eq(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2249,6 +2605,7 @@ vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2257,6 +2614,7 @@ vec_all_eq(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2273,6 +2631,15 @@ vec_all_eq(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_eq(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_eq(vector double __a, vector double __b) { int __cc; @@ -2289,6 +2656,7 @@ vec_all_ne(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed char __a, vector bool char __b) { int __cc; @@ -2296,6 +2664,7 @@ vec_all_ne(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool char __a, vector signed char __b) { int __cc; @@ -2311,6 +2680,7 @@ vec_all_ne(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2319,6 +2689,7 @@ vec_all_ne(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2342,6 +2713,7 @@ vec_all_ne(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed short __a, vector bool short __b) { int __cc; @@ -2349,6 +2721,7 @@ vec_all_ne(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool short __a, vector signed short __b) { int __cc; @@ -2364,6 +2737,7 @@ vec_all_ne(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2372,6 +2746,7 @@ vec_all_ne(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2395,6 +2770,7 @@ vec_all_ne(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed int __a, vector bool int __b) { int __cc; @@ -2402,6 +2778,7 @@ vec_all_ne(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool int __a, vector signed int __b) { int __cc; @@ -2417,6 +2794,7 @@ vec_all_ne(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2425,6 +2803,7 @@ vec_all_ne(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2448,6 +2827,7 @@ vec_all_ne(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2455,6 +2835,7 @@ vec_all_ne(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2470,6 +2851,7 @@ vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2478,6 +2860,7 @@ vec_all_ne(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2494,6 +2877,15 @@ vec_all_ne(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ne(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + static inline __ATTRS_o_ai int vec_all_ne(vector double __a, vector double __b) { int __cc; @@ -2510,6 +2902,7 @@ vec_all_ge(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed char __a, vector bool char __b) { int __cc; @@ -2517,6 +2910,7 @@ vec_all_ge(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector signed char __b) { int __cc; @@ -2531,6 +2925,7 @@ vec_all_ge(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2538,6 +2933,7 @@ vec_all_ge(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2545,6 +2941,7 @@ vec_all_ge(vector bool char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector bool char __b) { int __cc; @@ -2560,6 +2957,7 @@ vec_all_ge(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed short __a, vector bool short __b) { int __cc; @@ -2567,6 +2965,7 @@ vec_all_ge(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector signed short __b) { int __cc; @@ -2581,6 +2980,7 @@ vec_all_ge(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2588,6 +2988,7 @@ vec_all_ge(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2595,6 +2996,7 @@ vec_all_ge(vector bool short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector bool short __b) { int __cc; @@ -2610,6 +3012,7 @@ vec_all_ge(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed int __a, vector bool int __b) { int __cc; @@ -2617,6 +3020,7 @@ vec_all_ge(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector signed int __b) { int __cc; @@ -2631,6 +3035,7 @@ vec_all_ge(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2638,6 +3043,7 @@ vec_all_ge(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2645,6 +3051,7 @@ vec_all_ge(vector bool int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector bool int __b) { int __cc; @@ -2660,6 +3067,7 @@ vec_all_ge(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2667,6 +3075,7 @@ vec_all_ge(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2681,6 +3090,7 @@ vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2688,6 +3098,7 @@ vec_all_ge(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2695,6 +3106,7 @@ vec_all_ge(vector bool long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector bool long long __b) { int __cc; @@ -2703,6 +3115,15 @@ vec_all_ge(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_ge(vector double __a, vector double __b) { int __cc; @@ -2719,6 +3140,7 @@ vec_all_gt(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed char __a, vector bool char __b) { int __cc; @@ -2726,6 +3148,7 @@ vec_all_gt(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector signed char __b) { int __cc; @@ -2740,6 +3163,7 @@ vec_all_gt(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2747,6 +3171,7 @@ vec_all_gt(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2754,6 +3179,7 @@ vec_all_gt(vector bool char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector bool char __b) { int __cc; @@ -2769,6 +3195,7 @@ vec_all_gt(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed short __a, vector bool short __b) { int __cc; @@ -2776,6 +3203,7 @@ vec_all_gt(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector signed short __b) { int __cc; @@ -2790,6 +3218,7 @@ vec_all_gt(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2797,6 +3226,7 @@ vec_all_gt(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2804,6 +3234,7 @@ vec_all_gt(vector bool short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector bool short __b) { int __cc; @@ -2819,6 +3250,7 @@ vec_all_gt(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed int __a, vector bool int __b) { int __cc; @@ -2826,6 +3258,7 @@ vec_all_gt(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector signed int __b) { int __cc; @@ -2840,6 +3273,7 @@ vec_all_gt(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2847,6 +3281,7 @@ vec_all_gt(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2854,6 +3289,7 @@ vec_all_gt(vector bool int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector bool int __b) { int __cc; @@ -2869,6 +3305,7 @@ vec_all_gt(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2876,6 +3313,7 @@ vec_all_gt(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2890,6 +3328,7 @@ vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2897,6 +3336,7 @@ vec_all_gt(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2904,6 +3344,7 @@ vec_all_gt(vector bool long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -2912,6 +3353,15 @@ vec_all_gt(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_gt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_gt(vector double __a, vector double __b) { int __cc; @@ -2928,6 +3378,7 @@ vec_all_le(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed char __a, vector bool char __b) { int __cc; @@ -2935,6 +3386,7 @@ vec_all_le(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector signed char __b) { int __cc; @@ -2949,6 +3401,7 @@ vec_all_le(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2956,6 +3409,7 @@ vec_all_le(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2963,6 +3417,7 @@ vec_all_le(vector bool char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector bool char __b) { int __cc; @@ -2978,6 +3433,7 @@ vec_all_le(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed short __a, vector bool short __b) { int __cc; @@ -2985,6 +3441,7 @@ vec_all_le(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector signed short __b) { int __cc; @@ -2999,6 +3456,7 @@ vec_all_le(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3006,6 +3464,7 @@ vec_all_le(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3013,6 +3472,7 @@ vec_all_le(vector bool short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector bool short __b) { int __cc; @@ -3028,6 +3488,7 @@ vec_all_le(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed int __a, vector bool int __b) { int __cc; @@ -3035,6 +3496,7 @@ vec_all_le(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector signed int __b) { int __cc; @@ -3049,6 +3511,7 @@ vec_all_le(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3056,6 +3519,7 @@ vec_all_le(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3063,6 +3527,7 @@ vec_all_le(vector bool int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector bool int __b) { int __cc; @@ -3078,6 +3543,7 @@ vec_all_le(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3085,6 +3551,7 @@ vec_all_le(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3099,6 +3566,7 @@ vec_all_le(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3106,6 +3574,7 @@ vec_all_le(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3113,6 +3582,7 @@ vec_all_le(vector bool long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector bool long long __b) { int __cc; @@ -3121,6 +3591,15 @@ vec_all_le(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_le(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_le(vector double __a, vector double __b) { int __cc; @@ -3137,6 +3616,7 @@ vec_all_lt(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed char __a, vector bool char __b) { int __cc; @@ -3144,6 +3624,7 @@ vec_all_lt(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector signed char __b) { int __cc; @@ -3158,6 +3639,7 @@ vec_all_lt(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3165,6 +3647,7 @@ vec_all_lt(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3172,6 +3655,7 @@ vec_all_lt(vector bool char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector bool char __b) { int __cc; @@ -3187,6 +3671,7 @@ vec_all_lt(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed short __a, vector bool short __b) { int __cc; @@ -3194,6 +3679,7 @@ vec_all_lt(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector signed short __b) { int __cc; @@ -3208,6 +3694,7 @@ vec_all_lt(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3215,6 +3702,7 @@ vec_all_lt(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3222,6 +3710,7 @@ vec_all_lt(vector bool short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector bool short __b) { int __cc; @@ -3237,6 +3726,7 @@ vec_all_lt(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed int __a, vector bool int __b) { int __cc; @@ -3244,6 +3734,7 @@ vec_all_lt(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector signed int __b) { int __cc; @@ -3258,6 +3749,7 @@ vec_all_lt(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3265,6 +3757,7 @@ vec_all_lt(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3272,6 +3765,7 @@ vec_all_lt(vector bool int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector bool int __b) { int __cc; @@ -3287,6 +3781,7 @@ vec_all_lt(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3294,6 +3789,7 @@ vec_all_lt(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3308,6 +3804,7 @@ vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3315,6 +3812,7 @@ vec_all_lt(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3322,6 +3820,7 @@ vec_all_lt(vector bool long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -3330,6 +3829,15 @@ vec_all_lt(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_lt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_lt(vector double __a, vector double __b) { int __cc; @@ -3339,7 +3847,16 @@ vec_all_lt(vector double __a, vector double __b) { /*-- vec_all_nge ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nge(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__a, __b, &__cc); @@ -3348,7 +3865,16 @@ vec_all_nge(vector double __a, vector double __b) { /*-- vec_all_ngt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ngt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_ngt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__a, __b, &__cc); @@ -3357,7 +3883,16 @@ vec_all_ngt(vector double __a, vector double __b) { /*-- vec_all_nle ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nle(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nle(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__b, __a, &__cc); @@ -3366,7 +3901,16 @@ vec_all_nle(vector double __a, vector double __b) { /*-- vec_all_nlt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nlt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nlt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__b, __a, &__cc); @@ -3375,7 +3919,16 @@ vec_all_nlt(vector double __a, vector double __b) { /*-- vec_all_nan ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nan(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc == 0; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nan(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -3384,7 +3937,16 @@ vec_all_nan(vector double __a) { /*-- vec_all_numeric --------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_numeric(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_numeric(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -3400,6 +3962,7 @@ vec_any_eq(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed char __a, vector bool char __b) { int __cc; @@ -3407,6 +3970,7 @@ vec_any_eq(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool char __a, vector signed char __b) { int __cc; @@ -3422,6 +3986,7 @@ vec_any_eq(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3430,6 +3995,7 @@ vec_any_eq(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3453,6 +4019,7 @@ vec_any_eq(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed short __a, vector bool short __b) { int __cc; @@ -3460,6 +4027,7 @@ vec_any_eq(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool short __a, vector signed short __b) { int __cc; @@ -3475,6 +4043,7 @@ vec_any_eq(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3483,6 +4052,7 @@ vec_any_eq(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3506,6 +4076,7 @@ vec_any_eq(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed int __a, vector bool int __b) { int __cc; @@ -3513,6 +4084,7 @@ vec_any_eq(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool int __a, vector signed int __b) { int __cc; @@ -3528,6 +4100,7 @@ vec_any_eq(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3536,6 +4109,7 @@ vec_any_eq(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3559,6 +4133,7 @@ vec_any_eq(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3566,6 +4141,7 @@ vec_any_eq(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3581,6 +4157,7 @@ vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3589,6 +4166,7 @@ vec_any_eq(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3605,6 +4183,15 @@ vec_any_eq(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_eq(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_eq(vector double __a, vector double __b) { int __cc; @@ -3621,6 +4208,7 @@ vec_any_ne(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed char __a, vector bool char __b) { int __cc; @@ -3628,6 +4216,7 @@ vec_any_ne(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool char __a, vector signed char __b) { int __cc; @@ -3643,6 +4232,7 @@ vec_any_ne(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3651,6 +4241,7 @@ vec_any_ne(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3674,6 +4265,7 @@ vec_any_ne(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed short __a, vector bool short __b) { int __cc; @@ -3681,6 +4273,7 @@ vec_any_ne(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool short __a, vector signed short __b) { int __cc; @@ -3696,6 +4289,7 @@ vec_any_ne(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3704,6 +4298,7 @@ vec_any_ne(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3727,6 +4322,7 @@ vec_any_ne(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed int __a, vector bool int __b) { int __cc; @@ -3734,6 +4330,7 @@ vec_any_ne(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool int __a, vector signed int __b) { int __cc; @@ -3749,6 +4346,7 @@ vec_any_ne(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3757,6 +4355,7 @@ vec_any_ne(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3780,6 +4379,7 @@ vec_any_ne(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3787,6 +4387,7 @@ vec_any_ne(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3802,6 +4403,7 @@ vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3810,6 +4412,7 @@ vec_any_ne(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3826,6 +4429,15 @@ vec_any_ne(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ne(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + static inline __ATTRS_o_ai int vec_any_ne(vector double __a, vector double __b) { int __cc; @@ -3842,6 +4454,7 @@ vec_any_ge(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed char __a, vector bool char __b) { int __cc; @@ -3849,6 +4462,7 @@ vec_any_ge(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector signed char __b) { int __cc; @@ -3863,6 +4477,7 @@ vec_any_ge(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3870,6 +4485,7 @@ vec_any_ge(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3877,6 +4493,7 @@ vec_any_ge(vector bool char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector bool char __b) { int __cc; @@ -3892,6 +4509,7 @@ vec_any_ge(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed short __a, vector bool short __b) { int __cc; @@ -3899,6 +4517,7 @@ vec_any_ge(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector signed short __b) { int __cc; @@ -3913,6 +4532,7 @@ vec_any_ge(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3920,6 +4540,7 @@ vec_any_ge(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3927,6 +4548,7 @@ vec_any_ge(vector bool short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector bool short __b) { int __cc; @@ -3942,6 +4564,7 @@ vec_any_ge(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed int __a, vector bool int __b) { int __cc; @@ -3949,6 +4572,7 @@ vec_any_ge(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector signed int __b) { int __cc; @@ -3963,6 +4587,7 @@ vec_any_ge(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3970,6 +4595,7 @@ vec_any_ge(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3977,6 +4603,7 @@ vec_any_ge(vector bool int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector bool int __b) { int __cc; @@ -3992,6 +4619,7 @@ vec_any_ge(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3999,6 +4627,7 @@ vec_any_ge(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4013,6 +4642,7 @@ vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4020,6 +4650,7 @@ vec_any_ge(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4027,6 +4658,7 @@ vec_any_ge(vector bool long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4035,6 +4667,15 @@ vec_any_ge(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_ge(vector double __a, vector double __b) { int __cc; @@ -4051,6 +4692,7 @@ vec_any_gt(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed char __a, vector bool char __b) { int __cc; @@ -4058,6 +4700,7 @@ vec_any_gt(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector signed char __b) { int __cc; @@ -4072,6 +4715,7 @@ vec_any_gt(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4079,6 +4723,7 @@ vec_any_gt(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4086,6 +4731,7 @@ vec_any_gt(vector bool char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector bool char __b) { int __cc; @@ -4101,6 +4747,7 @@ vec_any_gt(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed short __a, vector bool short __b) { int __cc; @@ -4108,6 +4755,7 @@ vec_any_gt(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector signed short __b) { int __cc; @@ -4122,6 +4770,7 @@ vec_any_gt(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4129,6 +4778,7 @@ vec_any_gt(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4136,6 +4786,7 @@ vec_any_gt(vector bool short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector bool short __b) { int __cc; @@ -4151,6 +4802,7 @@ vec_any_gt(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed int __a, vector bool int __b) { int __cc; @@ -4158,6 +4810,7 @@ vec_any_gt(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector signed int __b) { int __cc; @@ -4172,6 +4825,7 @@ vec_any_gt(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4179,6 +4833,7 @@ vec_any_gt(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4186,6 +4841,7 @@ vec_any_gt(vector bool int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector bool int __b) { int __cc; @@ -4201,6 +4857,7 @@ vec_any_gt(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4208,6 +4865,7 @@ vec_any_gt(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4222,6 +4880,7 @@ vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4229,6 +4888,7 @@ vec_any_gt(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4236,6 +4896,7 @@ vec_any_gt(vector bool long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4244,6 +4905,15 @@ vec_any_gt(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_gt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_gt(vector double __a, vector double __b) { int __cc; @@ -4260,6 +4930,7 @@ vec_any_le(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed char __a, vector bool char __b) { int __cc; @@ -4267,6 +4938,7 @@ vec_any_le(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector signed char __b) { int __cc; @@ -4281,6 +4953,7 @@ vec_any_le(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4288,6 +4961,7 @@ vec_any_le(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4295,6 +4969,7 @@ vec_any_le(vector bool char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector bool char __b) { int __cc; @@ -4310,6 +4985,7 @@ vec_any_le(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed short __a, vector bool short __b) { int __cc; @@ -4317,6 +4993,7 @@ vec_any_le(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector signed short __b) { int __cc; @@ -4331,6 +5008,7 @@ vec_any_le(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4338,6 +5016,7 @@ vec_any_le(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4345,6 +5024,7 @@ vec_any_le(vector bool short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector bool short __b) { int __cc; @@ -4360,6 +5040,7 @@ vec_any_le(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed int __a, vector bool int __b) { int __cc; @@ -4367,6 +5048,7 @@ vec_any_le(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector signed int __b) { int __cc; @@ -4381,6 +5063,7 @@ vec_any_le(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4388,6 +5071,7 @@ vec_any_le(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4395,6 +5079,7 @@ vec_any_le(vector bool int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector bool int __b) { int __cc; @@ -4410,6 +5095,7 @@ vec_any_le(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4417,6 +5103,7 @@ vec_any_le(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4431,6 +5118,7 @@ vec_any_le(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4438,6 +5126,7 @@ vec_any_le(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4445,6 +5134,7 @@ vec_any_le(vector bool long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4453,6 +5143,15 @@ vec_any_le(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_le(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_le(vector double __a, vector double __b) { int __cc; @@ -4469,6 +5168,7 @@ vec_any_lt(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed char __a, vector bool char __b) { int __cc; @@ -4476,6 +5176,7 @@ vec_any_lt(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector signed char __b) { int __cc; @@ -4490,6 +5191,7 @@ vec_any_lt(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4497,6 +5199,7 @@ vec_any_lt(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4504,6 +5207,7 @@ vec_any_lt(vector bool char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector bool char __b) { int __cc; @@ -4519,6 +5223,7 @@ vec_any_lt(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed short __a, vector bool short __b) { int __cc; @@ -4526,6 +5231,7 @@ vec_any_lt(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector signed short __b) { int __cc; @@ -4540,6 +5246,7 @@ vec_any_lt(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4547,6 +5254,7 @@ vec_any_lt(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4554,6 +5262,7 @@ vec_any_lt(vector bool short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector bool short __b) { int __cc; @@ -4569,6 +5278,7 @@ vec_any_lt(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed int __a, vector bool int __b) { int __cc; @@ -4576,6 +5286,7 @@ vec_any_lt(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector signed int __b) { int __cc; @@ -4590,6 +5301,7 @@ vec_any_lt(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4597,6 +5309,7 @@ vec_any_lt(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4604,6 +5317,7 @@ vec_any_lt(vector bool int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector bool int __b) { int __cc; @@ -4619,6 +5333,7 @@ vec_any_lt(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4626,6 +5341,7 @@ vec_any_lt(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4640,6 +5356,7 @@ vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4647,6 +5364,7 @@ vec_any_lt(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4654,6 +5372,7 @@ vec_any_lt(vector bool long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4662,6 +5381,15 @@ vec_any_lt(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_lt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_lt(vector double __a, vector double __b) { int __cc; @@ -4671,7 +5399,16 @@ vec_any_lt(vector double __a, vector double __b) { /*-- vec_any_nge ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nge(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__a, __b, &__cc); @@ -4680,7 +5417,16 @@ vec_any_nge(vector double __a, vector double __b) { /*-- vec_any_ngt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ngt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_ngt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__a, __b, &__cc); @@ -4689,7 +5435,16 @@ vec_any_ngt(vector double __a, vector double __b) { /*-- vec_any_nle ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nle(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nle(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__b, __a, &__cc); @@ -4698,7 +5453,16 @@ vec_any_nle(vector double __a, vector double __b) { /*-- vec_any_nlt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nlt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nlt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__b, __a, &__cc); @@ -4707,7 +5471,16 @@ vec_any_nlt(vector double __a, vector double __b) { /*-- vec_any_nan ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nan(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc != 3; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nan(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -4716,7 +5489,16 @@ vec_any_nan(vector double __a) { /*-- vec_any_numeric --------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_numeric(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_numeric(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -4735,11 +5517,13 @@ vec_andc(vector signed char __a, vector signed char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_andc(vector bool char __a, vector signed char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_andc(vector signed char __a, vector bool char __b) { return __a & ~__b; @@ -4750,11 +5534,13 @@ vec_andc(vector unsigned char __a, vector unsigned char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_andc(vector bool char __a, vector unsigned char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_andc(vector unsigned char __a, vector bool char __b) { return __a & ~__b; @@ -4770,11 +5556,13 @@ vec_andc(vector signed short __a, vector signed short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_andc(vector bool short __a, vector signed short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_andc(vector signed short __a, vector bool short __b) { return __a & ~__b; @@ -4785,11 +5573,13 @@ vec_andc(vector unsigned short __a, vector unsigned short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_andc(vector bool short __a, vector unsigned short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_andc(vector unsigned short __a, vector bool short __b) { return __a & ~__b; @@ -4805,11 +5595,13 @@ vec_andc(vector signed int __a, vector signed int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_andc(vector bool int __a, vector signed int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_andc(vector signed int __a, vector bool int __b) { return __a & ~__b; @@ -4820,11 +5612,13 @@ vec_andc(vector unsigned int __a, vector unsigned int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_andc(vector bool int __a, vector unsigned int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_andc(vector unsigned int __a, vector bool int __b) { return __a & ~__b; @@ -4840,11 +5634,13 @@ vec_andc(vector signed long long __a, vector signed long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_andc(vector bool long long __a, vector signed long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_andc(vector signed long long __a, vector bool long long __b) { return __a & ~__b; @@ -4855,28 +5651,40 @@ vec_andc(vector unsigned long long __a, vector unsigned long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_andc(vector bool long long __a, vector unsigned long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_andc(vector unsigned long long __a, vector bool long long __b) { return __a & ~__b; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_andc(vector float __a, vector float __b) { + return (vector float)((vector unsigned int)__a & + ~(vector unsigned int)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_andc(vector double __a, vector double __b) { return (vector double)((vector unsigned long long)__a & ~(vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_andc(vector bool long long __a, vector double __b) { return (vector double)((vector unsigned long long)__a & ~(vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_andc(vector double __a, vector bool long long __b) { return (vector double)((vector unsigned long long)__a & @@ -4895,11 +5703,13 @@ vec_nor(vector signed char __a, vector signed char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_nor(vector bool char __a, vector signed char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_nor(vector signed char __a, vector bool char __b) { return ~(__a | __b); @@ -4910,11 +5720,13 @@ vec_nor(vector unsigned char __a, vector unsigned char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_nor(vector bool char __a, vector unsigned char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_nor(vector unsigned char __a, vector bool char __b) { return ~(__a | __b); @@ -4930,11 +5742,13 @@ vec_nor(vector signed short __a, vector signed short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_nor(vector bool short __a, vector signed short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_nor(vector signed short __a, vector bool short __b) { return ~(__a | __b); @@ -4945,11 +5759,13 @@ vec_nor(vector unsigned short __a, vector unsigned short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_nor(vector bool short __a, vector unsigned short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_nor(vector unsigned short __a, vector bool short __b) { return ~(__a | __b); @@ -4965,11 +5781,13 @@ vec_nor(vector signed int __a, vector signed int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_nor(vector bool int __a, vector signed int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_nor(vector signed int __a, vector bool int __b) { return ~(__a | __b); @@ -4980,11 +5798,13 @@ vec_nor(vector unsigned int __a, vector unsigned int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_nor(vector bool int __a, vector unsigned int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_nor(vector unsigned int __a, vector bool int __b) { return ~(__a | __b); @@ -5000,11 +5820,13 @@ vec_nor(vector signed long long __a, vector signed long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_nor(vector bool long long __a, vector signed long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_nor(vector signed long long __a, vector bool long long __b) { return ~(__a | __b); @@ -5015,34 +5837,274 @@ vec_nor(vector unsigned long long __a, vector unsigned long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_nor(vector bool long long __a, vector unsigned long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_nor(vector unsigned long long __a, vector bool long long __b) { return ~(__a | __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nor(vector float __a, vector float __b) { + return (vector float)~((vector unsigned int)__a | + (vector unsigned int)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_nor(vector double __a, vector double __b) { return (vector double)~((vector unsigned long long)__a | (vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_nor(vector bool long long __a, vector double __b) { return (vector double)~((vector unsigned long long)__a | (vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_nor(vector double __a, vector bool long long __b) { return (vector double)~((vector unsigned long long)__a | (vector unsigned long long)__b); } +/*-- vec_orc ----------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool char +vec_orc(vector bool char __a, vector bool char __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector signed char +vec_orc(vector signed char __a, vector signed char __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector unsigned char +vec_orc(vector unsigned char __a, vector unsigned char __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector bool short +vec_orc(vector bool short __a, vector bool short __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector signed short +vec_orc(vector signed short __a, vector signed short __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector unsigned short +vec_orc(vector unsigned short __a, vector unsigned short __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector bool int +vec_orc(vector bool int __a, vector bool int __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector signed int +vec_orc(vector signed int __a, vector signed int __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector unsigned int +vec_orc(vector unsigned int __a, vector unsigned int __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector bool long long +vec_orc(vector bool long long __a, vector bool long long __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector signed long long +vec_orc(vector signed long long __a, vector signed long long __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_orc(vector unsigned long long __a, vector unsigned long long __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai vector float +vec_orc(vector float __a, vector float __b) { + return (vector float)((vector unsigned int)__a & + ~(vector unsigned int)__b); +} + +static inline __ATTRS_o_ai vector double +vec_orc(vector double __a, vector double __b) { + return (vector double)((vector unsigned long long)__a & + ~(vector unsigned long long)__b); +} +#endif + +/*-- vec_nand ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool char +vec_nand(vector bool char __a, vector bool char __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector signed char +vec_nand(vector signed char __a, vector signed char __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_nand(vector unsigned char __a, vector unsigned char __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector bool short +vec_nand(vector bool short __a, vector bool short __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector signed short +vec_nand(vector signed short __a, vector signed short __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_nand(vector unsigned short __a, vector unsigned short __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector bool int +vec_nand(vector bool int __a, vector bool int __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector signed int +vec_nand(vector signed int __a, vector signed int __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_nand(vector unsigned int __a, vector unsigned int __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector bool long long +vec_nand(vector bool long long __a, vector bool long long __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector signed long long +vec_nand(vector signed long long __a, vector signed long long __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_nand(vector unsigned long long __a, vector unsigned long long __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai vector float +vec_nand(vector float __a, vector float __b) { + return (vector float)~((vector unsigned int)__a & + (vector unsigned int)__b); +} + +static inline __ATTRS_o_ai vector double +vec_nand(vector double __a, vector double __b) { + return (vector double)~((vector unsigned long long)__a & + (vector unsigned long long)__b); +} +#endif + +/*-- vec_eqv ----------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool char +vec_eqv(vector bool char __a, vector bool char __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector signed char +vec_eqv(vector signed char __a, vector signed char __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_eqv(vector unsigned char __a, vector unsigned char __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector bool short +vec_eqv(vector bool short __a, vector bool short __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector signed short +vec_eqv(vector signed short __a, vector signed short __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_eqv(vector unsigned short __a, vector unsigned short __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector bool int +vec_eqv(vector bool int __a, vector bool int __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector signed int +vec_eqv(vector signed int __a, vector signed int __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_eqv(vector unsigned int __a, vector unsigned int __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector bool long long +vec_eqv(vector bool long long __a, vector bool long long __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector signed long long +vec_eqv(vector signed long long __a, vector signed long long __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_eqv(vector unsigned long long __a, vector unsigned long long __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai vector float +vec_eqv(vector float __a, vector float __b) { + return (vector float)~((vector unsigned int)__a ^ + (vector unsigned int)__b); +} + +static inline __ATTRS_o_ai vector double +vec_eqv(vector double __a, vector double __b) { + return (vector double)~((vector unsigned long long)__a ^ + (vector unsigned long long)__b); +} +#endif + /*-- vec_cntlz --------------------------------------------------------------*/ static inline __ATTRS_o_ai vector unsigned char @@ -5323,30 +6385,35 @@ vec_sll(vector signed char __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_sll(vector signed char __a, vector unsigned short __b) { return (vector signed char)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_sll(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsl( @@ -5358,11 +6425,13 @@ vec_sll(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsl(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sll(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsl(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sll(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsl(__a, (vector unsigned char)__b); @@ -5374,30 +6443,35 @@ vec_sll(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sll(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sll(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsl( @@ -5410,12 +6484,14 @@ vec_sll(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sll(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sll(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsl( @@ -5428,30 +6504,35 @@ vec_sll(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sll(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sll(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsl( @@ -5464,12 +6545,14 @@ vec_sll(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sll(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sll(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsl( @@ -5482,30 +6565,35 @@ vec_sll(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sll(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sll(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsl( @@ -5518,12 +6606,14 @@ vec_sll(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sll(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sll(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsl( @@ -5626,6 +6716,20 @@ vec_slb(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_slb(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vslb( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_slb(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vslb( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_slb(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vslb( @@ -5644,6 +6748,10 @@ extern __ATTRS_o vector signed char vec_sld(vector signed char __a, vector signed char __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool char +vec_sld(vector bool char __a, vector bool char __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned char vec_sld(vector unsigned char __a, vector unsigned char __b, int __c) __constant_range(__c, 0, 15); @@ -5652,6 +6760,10 @@ extern __ATTRS_o vector signed short vec_sld(vector signed short __a, vector signed short __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool short +vec_sld(vector bool short __a, vector bool short __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned short vec_sld(vector unsigned short __a, vector unsigned short __b, int __c) __constant_range(__c, 0, 15); @@ -5660,6 +6772,10 @@ extern __ATTRS_o vector signed int vec_sld(vector signed int __a, vector signed int __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool int +vec_sld(vector bool int __a, vector bool int __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned int vec_sld(vector unsigned int __a, vector unsigned int __b, int __c) __constant_range(__c, 0, 15); @@ -5668,10 +6784,20 @@ extern __ATTRS_o vector signed long long vec_sld(vector signed long long __a, vector signed long long __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool long long +vec_sld(vector bool long long __a, vector bool long long __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned long long vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 15); +#if __ARCH__ >= 12 +extern __ATTRS_o vector float +vec_sld(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 15); +#endif + extern __ATTRS_o vector double vec_sld(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 15); @@ -5714,6 +6840,7 @@ extern __ATTRS_o vector unsigned long long vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector double vec_sldw(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 3); @@ -5730,30 +6857,35 @@ vec_sral(vector signed char __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_sral(vector signed char __a, vector unsigned short __b) { return (vector signed char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_sral(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsra( @@ -5765,11 +6897,13 @@ vec_sral(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsra(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sral(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsra(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sral(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsra(__a, (vector unsigned char)__b); @@ -5781,30 +6915,35 @@ vec_sral(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sral(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sral(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsra( @@ -5817,12 +6956,14 @@ vec_sral(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sral(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sral(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsra( @@ -5835,30 +6976,35 @@ vec_sral(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sral(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sral(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsra( @@ -5871,12 +7017,14 @@ vec_sral(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sral(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sral(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsra( @@ -5889,30 +7037,35 @@ vec_sral(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sral(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sral(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsra( @@ -5925,12 +7078,14 @@ vec_sral(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sral(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sral(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsra( @@ -6033,6 +7188,20 @@ vec_srab(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_srab(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vsrab( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_srab(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vsrab( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_srab(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vsrab( @@ -6053,30 +7222,35 @@ vec_srl(vector signed char __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_srl(vector signed char __a, vector unsigned short __b) { return (vector signed char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_srl(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsrl( @@ -6088,11 +7262,13 @@ vec_srl(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsrl(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_srl(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsrl(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_srl(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsrl(__a, (vector unsigned char)__b); @@ -6104,30 +7280,35 @@ vec_srl(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_srl(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_srl(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsrl( @@ -6140,12 +7321,14 @@ vec_srl(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_srl(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_srl(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsrl( @@ -6158,30 +7341,35 @@ vec_srl(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_srl(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_srl(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsrl( @@ -6194,12 +7382,14 @@ vec_srl(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_srl(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_srl(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsrl( @@ -6212,30 +7402,35 @@ vec_srl(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_srl(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_srl(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsrl( @@ -6248,12 +7443,14 @@ vec_srl(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_srl(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_srl(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsrl( @@ -6356,6 +7553,20 @@ vec_srb(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_srb(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vsrlb( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_srb(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vsrlb( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_srb(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vsrlb( @@ -6390,6 +7601,13 @@ vec_abs(vector signed long long __a) { return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_abs(vector float __a) { + return __builtin_s390_vflpsb(__a); +} +#endif + static inline __ATTRS_o_ai vector double vec_abs(vector double __a) { return __builtin_s390_vflpdb(__a); @@ -6397,7 +7615,14 @@ vec_abs(vector double __a) { /*-- vec_nabs ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nabs(vector float __a) { + return __builtin_s390_vflnsb(__a); +} +#endif + +static inline __ATTRS_o_ai vector double vec_nabs(vector double __a) { return __builtin_s390_vflndb(__a); } @@ -6409,12 +7634,14 @@ vec_max(vector signed char __a, vector signed char __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_max(vector signed char __a, vector bool char __b) { vector signed char __bc = (vector signed char)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_max(vector bool char __a, vector signed char __b) { vector signed char __ac = (vector signed char)__a; @@ -6426,12 +7653,14 @@ vec_max(vector unsigned char __a, vector unsigned char __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_max(vector unsigned char __a, vector bool char __b) { vector unsigned char __bc = (vector unsigned char)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_max(vector bool char __a, vector unsigned char __b) { vector unsigned char __ac = (vector unsigned char)__a; @@ -6443,12 +7672,14 @@ vec_max(vector signed short __a, vector signed short __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_max(vector signed short __a, vector bool short __b) { vector signed short __bc = (vector signed short)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_max(vector bool short __a, vector signed short __b) { vector signed short __ac = (vector signed short)__a; @@ -6460,12 +7691,14 @@ vec_max(vector unsigned short __a, vector unsigned short __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_max(vector unsigned short __a, vector bool short __b) { vector unsigned short __bc = (vector unsigned short)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_max(vector bool short __a, vector unsigned short __b) { vector unsigned short __ac = (vector unsigned short)__a; @@ -6477,12 +7710,14 @@ vec_max(vector signed int __a, vector signed int __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_max(vector signed int __a, vector bool int __b) { vector signed int __bc = (vector signed int)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_max(vector bool int __a, vector signed int __b) { vector signed int __ac = (vector signed int)__a; @@ -6494,12 +7729,14 @@ vec_max(vector unsigned int __a, vector unsigned int __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_max(vector unsigned int __a, vector bool int __b) { vector unsigned int __bc = (vector unsigned int)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_max(vector bool int __a, vector unsigned int __b) { vector unsigned int __ac = (vector unsigned int)__a; @@ -6511,12 +7748,14 @@ vec_max(vector signed long long __a, vector signed long long __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_max(vector signed long long __a, vector bool long long __b) { vector signed long long __bc = (vector signed long long)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_max(vector bool long long __a, vector signed long long __b) { vector signed long long __ac = (vector signed long long)__a; @@ -6528,21 +7767,34 @@ vec_max(vector unsigned long long __a, vector unsigned long long __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_max(vector unsigned long long __a, vector bool long long __b) { vector unsigned long long __bc = (vector unsigned long long)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_max(vector bool long long __a, vector unsigned long long __b) { vector unsigned long long __ac = (vector unsigned long long)__a; return vec_sel(__b, __ac, vec_cmpgt(__ac, __b)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_max(vector float __a, vector float __b) { + return __builtin_s390_vfmaxsb(__a, __b, 0); +} +#endif + static inline __ATTRS_o_ai vector double vec_max(vector double __a, vector double __b) { +#if __ARCH__ >= 12 + return __builtin_s390_vfmaxdb(__a, __b, 0); +#else return vec_sel(__b, __a, vec_cmpgt(__a, __b)); +#endif } /*-- vec_min ----------------------------------------------------------------*/ @@ -6552,12 +7804,14 @@ vec_min(vector signed char __a, vector signed char __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_min(vector signed char __a, vector bool char __b) { vector signed char __bc = (vector signed char)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_min(vector bool char __a, vector signed char __b) { vector signed char __ac = (vector signed char)__a; @@ -6569,12 +7823,14 @@ vec_min(vector unsigned char __a, vector unsigned char __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_min(vector unsigned char __a, vector bool char __b) { vector unsigned char __bc = (vector unsigned char)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_min(vector bool char __a, vector unsigned char __b) { vector unsigned char __ac = (vector unsigned char)__a; @@ -6586,12 +7842,14 @@ vec_min(vector signed short __a, vector signed short __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_min(vector signed short __a, vector bool short __b) { vector signed short __bc = (vector signed short)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_min(vector bool short __a, vector signed short __b) { vector signed short __ac = (vector signed short)__a; @@ -6603,12 +7861,14 @@ vec_min(vector unsigned short __a, vector unsigned short __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_min(vector unsigned short __a, vector bool short __b) { vector unsigned short __bc = (vector unsigned short)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_min(vector bool short __a, vector unsigned short __b) { vector unsigned short __ac = (vector unsigned short)__a; @@ -6620,12 +7880,14 @@ vec_min(vector signed int __a, vector signed int __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_min(vector signed int __a, vector bool int __b) { vector signed int __bc = (vector signed int)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_min(vector bool int __a, vector signed int __b) { vector signed int __ac = (vector signed int)__a; @@ -6637,12 +7899,14 @@ vec_min(vector unsigned int __a, vector unsigned int __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_min(vector unsigned int __a, vector bool int __b) { vector unsigned int __bc = (vector unsigned int)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_min(vector bool int __a, vector unsigned int __b) { vector unsigned int __ac = (vector unsigned int)__a; @@ -6654,12 +7918,14 @@ vec_min(vector signed long long __a, vector signed long long __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_min(vector signed long long __a, vector bool long long __b) { vector signed long long __bc = (vector signed long long)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_min(vector bool long long __a, vector signed long long __b) { vector signed long long __ac = (vector signed long long)__a; @@ -6671,21 +7937,34 @@ vec_min(vector unsigned long long __a, vector unsigned long long __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_min(vector unsigned long long __a, vector bool long long __b) { vector unsigned long long __bc = (vector unsigned long long)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_min(vector bool long long __a, vector unsigned long long __b) { vector unsigned long long __ac = (vector unsigned long long)__a; return vec_sel(__ac, __b, vec_cmpgt(__ac, __b)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_min(vector float __a, vector float __b) { + return __builtin_s390_vfminsb(__a, __b, 0); +} +#endif + static inline __ATTRS_o_ai vector double vec_min(vector double __a, vector double __b) { +#if __ARCH__ >= 12 + return __builtin_s390_vfmindb(__a, __b, 0); +#else return vec_sel(__a, __b, vec_cmpgt(__a, __b)); +#endif } /*-- vec_add_u128 -----------------------------------------------------------*/ @@ -7126,6 +8405,13 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) { return __builtin_s390_vmlof(__a, __b); } +/*-- vec_msum_u128 ----------------------------------------------------------*/ + +#if __ARCH__ >= 12 +#define vec_msum_u128(X, Y, Z, W) \ + ((vector unsigned char)__builtin_s390_vmslg((X), (Y), (Z), (W))); +#endif + /*-- vec_sub_u128 -----------------------------------------------------------*/ static inline __ATTRS_ai vector unsigned char @@ -7263,6 +8549,14 @@ vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_test_mask(vector float __a, vector unsigned int __b) { + return __builtin_s390_vtm((vector unsigned char)__a, + (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai int vec_test_mask(vector double __a, vector unsigned long long __b) { return __builtin_s390_vtm((vector unsigned char)__a, @@ -7271,27 +8565,77 @@ vec_test_mask(vector double __a, vector unsigned long long __b) { /*-- vec_madd ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_madd(vector float __a, vector float __b, vector float __c) { + return __builtin_s390_vfmasb(__a, __b, __c); +} +#endif + +static inline __ATTRS_o_ai vector double vec_madd(vector double __a, vector double __b, vector double __c) { return __builtin_s390_vfmadb(__a, __b, __c); } /*-- vec_msub ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_msub(vector float __a, vector float __b, vector float __c) { + return __builtin_s390_vfmssb(__a, __b, __c); +} +#endif + +static inline __ATTRS_o_ai vector double vec_msub(vector double __a, vector double __b, vector double __c) { return __builtin_s390_vfmsdb(__a, __b, __c); } +/*-- vec_nmadd ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nmadd(vector float __a, vector float __b, vector float __c) { + return __builtin_s390_vfnmasb(__a, __b, __c); +} + +static inline __ATTRS_o_ai vector double +vec_nmadd(vector double __a, vector double __b, vector double __c) { + return __builtin_s390_vfnmadb(__a, __b, __c); +} +#endif + +/*-- vec_nmsub ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nmsub(vector float __a, vector float __b, vector float __c) { + return __builtin_s390_vfnmssb(__a, __b, __c); +} + +static inline __ATTRS_o_ai vector double +vec_nmsub(vector double __a, vector double __b, vector double __c) { + return __builtin_s390_vfnmsdb(__a, __b, __c); +} +#endif + /*-- vec_sqrt ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_sqrt(vector float __a) { + return __builtin_s390_vfsqsb(__a); +} +#endif + +static inline __ATTRS_o_ai vector double vec_sqrt(vector double __a) { return __builtin_s390_vfsqdb(__a); } /*-- vec_ld2f ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai vector double vec_ld2f(const float *__ptr) { typedef float __v2f32 __attribute__((__vector_size__(8))); @@ -7300,6 +8644,7 @@ vec_ld2f(const float *__ptr) { /*-- vec_st2f ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai void vec_st2f(vector double __a, float *__ptr) { typedef float __v2f32 __attribute__((__vector_size__(8))); @@ -7308,6 +8653,7 @@ vec_st2f(vector double __a, float *__ptr) { /*-- vec_ctd ----------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_ctd(vector signed long long __a, int __b) __constant_range(__b, 0, 31) { @@ -7316,6 +8662,7 @@ vec_ctd(vector signed long long __a, int __b) return __conv; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_ctd(vector unsigned long long __a, int __b) __constant_range(__b, 0, 31) { @@ -7326,6 +8673,7 @@ vec_ctd(vector unsigned long long __a, int __b) /*-- vec_ctsl ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_ctsl(vector double __a, int __b) __constant_range(__b, 0, 31) { @@ -7335,6 +8683,7 @@ vec_ctsl(vector double __a, int __b) /*-- vec_ctul ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_ctul(vector double __a, int __b) __constant_range(__b, 0, 31) { @@ -7342,16 +8691,79 @@ vec_ctul(vector double __a, int __b) return __builtin_convertvector(__a, vector unsigned long long); } -/*-- vec_roundp -------------------------------------------------------------*/ +/*-- vec_doublee ------------------------------------------------------------*/ +#if __ARCH__ >= 12 static inline __ATTRS_ai vector double +vec_doublee(vector float __a) { + typedef float __v2f32 __attribute__((__vector_size__(8))); + __v2f32 __pack = __builtin_shufflevector(__a, __a, 0, 2); + return __builtin_convertvector(__pack, vector double); +} +#endif + +/*-- vec_floate -------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector float +vec_floate(vector double __a) { + typedef float __v2f32 __attribute__((__vector_size__(8))); + __v2f32 __pack = __builtin_convertvector(__a, __v2f32); + return __builtin_shufflevector(__pack, __pack, 0, -1, 1, -1); +} +#endif + +/*-- vec_double -------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector double +vec_double(vector signed long long __a) { + return __builtin_convertvector(__a, vector double); +} + +static inline __ATTRS_o_ai vector double +vec_double(vector unsigned long long __a) { + return __builtin_convertvector(__a, vector double); +} + +/*-- vec_signed -------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed long long +vec_signed(vector double __a) { + return __builtin_convertvector(__a, vector signed long long); +} + +/*-- vec_unsigned -----------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector unsigned long long +vec_unsigned(vector double __a) { + return __builtin_convertvector(__a, vector unsigned long long); +} + +/*-- vec_roundp -------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundp(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 6); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundp(vector double __a) { return __builtin_s390_vfidb(__a, 4, 6); } /*-- vec_ceil ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_ceil(vector float __a) { + // On this platform, vec_ceil never triggers the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 4, 6); +} +#endif + +static inline __ATTRS_o_ai vector double vec_ceil(vector double __a) { // On this platform, vec_ceil never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 6); @@ -7359,14 +8771,29 @@ vec_ceil(vector double __a) { /*-- vec_roundm -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundm(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 7); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundm(vector double __a) { return __builtin_s390_vfidb(__a, 4, 7); } /*-- vec_floor --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_floor(vector float __a) { + // On this platform, vec_floor never triggers the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 4, 7); +} +#endif + +static inline __ATTRS_o_ai vector double vec_floor(vector double __a) { // On this platform, vec_floor never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 7); @@ -7374,14 +8801,29 @@ vec_floor(vector double __a) { /*-- vec_roundz -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundz(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 5); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundz(vector double __a) { return __builtin_s390_vfidb(__a, 4, 5); } /*-- vec_trunc --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_trunc(vector float __a) { + // On this platform, vec_trunc never triggers the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 4, 5); +} +#endif + +static inline __ATTRS_o_ai vector double vec_trunc(vector double __a) { // On this platform, vec_trunc never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 5); @@ -7389,22 +8831,104 @@ vec_trunc(vector double __a) { /*-- vec_roundc -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundc(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 0); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundc(vector double __a) { return __builtin_s390_vfidb(__a, 4, 0); } +/*-- vec_rint ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_rint(vector float __a) { + // vec_rint may trigger the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 0, 0); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_rint(vector double __a) { + // vec_rint may trigger the IEEE-inexact exception. + return __builtin_s390_vfidb(__a, 0, 0); +} + /*-- vec_round --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_round(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 4); +} +#endif + +static inline __ATTRS_o_ai vector double vec_round(vector double __a) { return __builtin_s390_vfidb(__a, 4, 4); } /*-- vec_fp_test_data_class -------------------------------------------------*/ +#if __ARCH__ >= 12 +extern __ATTRS_o vector bool int +vec_fp_test_data_class(vector float __a, int __b, int *__c) + __constant_range(__b, 0, 4095); + +extern __ATTRS_o vector bool long long +vec_fp_test_data_class(vector double __a, int __b, int *__c) + __constant_range(__b, 0, 4095); + +#define vec_fp_test_data_class(X, Y, Z) \ + ((__typeof__((vec_fp_test_data_class)((X), (Y), (Z)))) \ + __extension__ ({ \ + vector unsigned char __res; \ + vector unsigned char __x = (vector unsigned char)(X); \ + int *__z = (Z); \ + switch (sizeof ((X)[0])) { \ + case 4: __res = (vector unsigned char) \ + __builtin_s390_vftcisb((vector float)__x, (Y), __z); \ + break; \ + default: __res = (vector unsigned char) \ + __builtin_s390_vftcidb((vector double)__x, (Y), __z); \ + break; \ + } __res; })) +#else #define vec_fp_test_data_class(X, Y, Z) \ ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z))) +#endif + +#define __VEC_CLASS_FP_ZERO_P (1 << 11) +#define __VEC_CLASS_FP_ZERO_N (1 << 10) +#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | __VEC_CLASS_FP_ZERO_N) +#define __VEC_CLASS_FP_NORMAL_P (1 << 9) +#define __VEC_CLASS_FP_NORMAL_N (1 << 8) +#define __VEC_CLASS_FP_NORMAL (__VEC_CLASS_FP_NORMAL_P | \ + __VEC_CLASS_FP_NORMAL_N) +#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 7) +#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 6) +#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \ + __VEC_CLASS_FP_SUBNORMAL_N) +#define __VEC_CLASS_FP_INFINITY_P (1 << 5) +#define __VEC_CLASS_FP_INFINITY_N (1 << 4) +#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \ + __VEC_CLASS_FP_INFINITY_N) +#define __VEC_CLASS_FP_QNAN_P (1 << 3) +#define __VEC_CLASS_FP_QNAN_N (1 << 2) +#define __VEC_CLASS_FP_QNAN (__VEC_CLASS_FP_QNAN_P | __VEC_CLASS_FP_QNAN_N) +#define __VEC_CLASS_FP_SNAN_P (1 << 1) +#define __VEC_CLASS_FP_SNAN_N (1 << 0) +#define __VEC_CLASS_FP_SNAN (__VEC_CLASS_FP_SNAN_P | __VEC_CLASS_FP_SNAN_N) +#define __VEC_CLASS_FP_NAN (__VEC_CLASS_FP_QNAN | __VEC_CLASS_FP_SNAN) +#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \ + __VEC_CLASS_FP_SUBNORMAL | \ + __VEC_CLASS_FP_ZERO | \ + __VEC_CLASS_FP_INFINITY) /*-- vec_cp_until_zero ------------------------------------------------------*/ diff --git a/contrib/llvm/tools/clang/lib/Headers/x86intrin.h b/contrib/llvm/tools/clang/lib/Headers/x86intrin.h index 81a404f..31ee7b8 100644 --- a/contrib/llvm/tools/clang/lib/Headers/x86intrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/x86intrin.h @@ -72,6 +72,10 @@ #include <tbmintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__) +#include <lwpintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__) #include <f16cintrin.h> #endif @@ -80,6 +84,8 @@ #include <mwaitxintrin.h> #endif -/* FIXME: LWP */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__) +#include <clzerointrin.h> +#endif #endif /* __X86INTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h b/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h index dc31b85..bbc2117 100644 --- a/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h @@ -2067,7 +2067,7 @@ _mm_storer_ps(float *__p, __m128 __a) /// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will /// be generated. \n /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will -/// be generated. +/// be generated. #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) #endif @@ -2099,7 +2099,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a) /// /// \param __p /// A pointer to a 128-bit aligned memory location that will receive the -/// integer values. +/// single-precision floating-point values. /// \param __a /// A 128-bit vector of [4 x float] containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS @@ -2133,7 +2133,7 @@ void _mm_sfence(void); /// \headerfile <x86intrin.h> /// /// \code -/// void _mm_extract_pi(__m64 a, int n); +/// int _mm_extract_pi16(__m64 a, int n); /// \endcode /// /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. @@ -2157,7 +2157,7 @@ void _mm_sfence(void); /// \headerfile <x86intrin.h> /// /// \code -/// void _mm_insert_pi(__m64 a, int d, int n); +/// __m64 _mm_insert_pi16(__m64 a, int d, int n); /// \endcode /// /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. @@ -2331,8 +2331,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// \brief Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as /// specified by the most significant bit in the corresponding element in the -/// second 64-bit integer vector operand. To minimize caching, the data is -/// flagged as non-temporal (unlikely to be used again soon). +/// second 64-bit integer vector operand. +/// +/// To minimize caching, the data is flagged as non-temporal +/// (unlikely to be used again soon). /// /// \headerfile <x86intrin.h> /// @@ -2435,17 +2437,17 @@ extern "C" { /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). -/// </li> +/// </li> /// <li> /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper /// _MM_GET_ROUNDING_MODE(x) where x is one of these macros. /// </li> -/// <li> +/// <li> /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). /// </li> -/// <li> +/// <li> /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper /// _MM_GET_DENORMALS_ZERO_MODE(). @@ -2468,11 +2470,11 @@ extern "C" { unsigned int _mm_getcsr(void); /// \brief Sets the MXCSR register with the 32-bit unsigned integer value. -/// +/// /// There are several groups of macros associated with this intrinsic, /// including: /// <ul> -/// <li> +/// <li> /// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, /// _MM_EXCEPT_INEXACT. There is a convenience wrapper @@ -2517,7 +2519,7 @@ unsigned int _mm_getcsr(void); /// /// \param __i /// A 32-bit unsigned integer value to be written to the MXCSR register. -void _mm_setcsr(unsigned int); +void _mm_setcsr(unsigned int __i); #if defined(__cplusplus) } // extern "C" @@ -2540,7 +2542,7 @@ void _mm_setcsr(unsigned int); /// A 128-bit vector of [4 x float]. /// \param mask /// An immediate value containing an 8-bit value specifying which elements to -/// copy from \ a and \a b. \n +/// copy from \a a and \a b. \n /// Bits [3:0] specify the values copied from operand \a a. \n /// Bits [7:4] specify the values copied from operand \a b. \n /// The destinations within the 128-bit destination are assigned values as @@ -2678,8 +2680,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. /// /// \param __a /// A 64-bit vector of [4 x i16]. The elements of the destination are copied @@ -2709,8 +2710,7 @@ _mm_cvtpi16_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. /// /// \param __a /// A 64-bit vector of 16-bit unsigned integer values. The elements of the @@ -2739,8 +2739,7 @@ _mm_cvtpu16_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. /// /// \param __a /// A 64-bit vector of [8 x i8]. The elements of the destination are copied @@ -2764,8 +2763,7 @@ _mm_cvtpi8_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. /// /// \param __a /// A 64-bit vector of unsigned 8-bit integer values. The elements of the @@ -2789,8 +2787,7 @@ _mm_cvtpu8_ps(__m64 __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction. /// /// \param __a /// A 64-bit vector of [2 x i32]. The lower elements of the destination are @@ -2815,16 +2812,16 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) /// \brief Converts each single-precision floating-point element of a 128-bit /// floating-point vector of [4 x float] into a 16-bit signed integer, and -/// packs the results into a 64-bit integer vector of [4 x i16]. If the -/// floating-point element is NaN or infinity, or if the floating-point -/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted -/// to 0x8000. Otherwise if the floating-point element is greater than -/// 0x7FFF, it is converted to 0x7FFF. +/// packs the results into a 64-bit integer vector of [4 x i16]. +/// +/// If the floating-point element is NaN or infinity, or if the +/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, +/// it is converted to 0x8000. Otherwise if the floating-point element is +/// greater than 0x7FFF, it is converted to 0x7FFF. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. @@ -2845,16 +2842,16 @@ _mm_cvtps_pi16(__m128 __a) /// \brief Converts each single-precision floating-point element of a 128-bit /// floating-point vector of [4 x float] into an 8-bit signed integer, and /// packs the results into the lower 32 bits of a 64-bit integer vector of -/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the -/// floating-point element is NaN or infinity, or if the floating-point -/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted -/// to 0x80. Otherwise if the floating-point element is greater than 0x7F, -/// it is converted to 0x7F. +/// [8 x i8]. The upper 32 bits of the vector are set to 0. +/// +/// If the floating-point element is NaN or infinity, or if the +/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it +/// is converted to 0x80. Otherwise if the floating-point element is greater +/// than 0x7F, it is converted to 0x7F. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c> -/// instruction. +/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction. /// /// \param __a /// 128-bit floating-point vector of [4 x float]. diff --git a/contrib/llvm/tools/clang/lib/Headers/xopintrin.h b/contrib/llvm/tools/clang/lib/Headers/xopintrin.h index bdf0cec..4a34f77 100644 --- a/contrib/llvm/tools/clang/lib/Headers/xopintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/xopintrin.h @@ -198,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C); + return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C)); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C); + return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C)); } static __inline__ __m128i __DEFAULT_FN_ATTRS |