diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers')
23 files changed, 6493 insertions, 396 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/Intrin.h b/contrib/llvm/tools/clang/lib/Headers/Intrin.h index 84bc430..727a55e 100644 --- a/contrib/llvm/tools/clang/lib/Headers/Intrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/Intrin.h @@ -289,6 +289,7 @@ void _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); static __inline__ +#define _XCR_XFEATURE_ENABLED_MASK 0 unsigned __int64 __cdecl _xgetbv(unsigned int); void __cdecl _xrstor(void const *, unsigned __int64); void __cdecl _xsave(void *, unsigned __int64); @@ -780,17 +781,17 @@ _InterlockedCompareExchange64(__int64 volatile *_Destination, \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) static __inline__ void __attribute__((__always_inline__, __nodebug__)) -__attribute__((deprecated("use other intrinsics or C++11 atomics instead"))) +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void) { __asm__ volatile ("" : : : "memory"); } static __inline__ void __attribute__((__always_inline__, __nodebug__)) -__attribute__((deprecated("use other intrinsics or C++11 atomics instead"))) +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void) { __asm__ volatile ("" : : : "memory"); } static __inline__ void __attribute__((__always_inline__, __nodebug__)) -__attribute__((deprecated("use other intrinsics or C++11 atomics instead"))) +__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void) { __asm__ volatile ("" : : : "memory"); } @@ -943,14 +944,14 @@ __readmsr(unsigned long __register) { return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax; } -static __inline__ unsigned long __attribute__((always_inline, __nodebug__)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __readcr3(void) { unsigned long __cr3_val; __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory"); return __cr3_val; } -static __inline__ void __attribute__((always_inline, __nodebug__)) +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __writecr3(unsigned int __cr3_val) { __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory"); } diff --git a/contrib/llvm/tools/clang/lib/Headers/__stddef_max_align_t.h b/contrib/llvm/tools/clang/lib/Headers/__stddef_max_align_t.h index a06f412..1e10ca9 100644 --- a/contrib/llvm/tools/clang/lib/Headers/__stddef_max_align_t.h +++ b/contrib/llvm/tools/clang/lib/Headers/__stddef_max_align_t.h @@ -26,15 +26,18 @@ #ifndef __CLANG_MAX_ALIGN_T_DEFINED #define __CLANG_MAX_ALIGN_T_DEFINED -#ifndef _MSC_VER +#if defined(_MSC_VER) +typedef double max_align_t; +#elif defined(__APPLE__) +typedef long double max_align_t; +#else +// Define 'max_align_t' to match the GCC definition. typedef struct { long long __clang_max_align_nonce1 __attribute__((__aligned__(__alignof__(long long)))); long double __clang_max_align_nonce2 __attribute__((__aligned__(__alignof__(long double)))); } max_align_t; -#else -typedef double max_align_t; #endif #endif diff --git a/contrib/llvm/tools/clang/lib/Headers/altivec.h b/contrib/llvm/tools/clang/lib/Headers/altivec.h index 0ac0841..1f8c831 100644 --- a/contrib/llvm/tools/clang/lib/Headers/altivec.h +++ b/contrib/llvm/tools/clang/lib/Headers/altivec.h @@ -73,6 +73,18 @@ vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c); static vector float __ATTRS_o_ai vec_perm(vector float __a, vector float __b, vector unsigned char __c); +#ifdef __VSX__ +static vector long long __ATTRS_o_ai +vec_perm(vector long long __a, vector long long __b, vector unsigned char __c); + +static vector unsigned long long __ATTRS_o_ai +vec_perm(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned char __c); + +static vector double __ATTRS_o_ai +vec_perm(vector double __a, vector double __b, vector unsigned char __c); +#endif + static vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b); @@ -245,6 +257,20 @@ vec_add(vector unsigned int __a, vector bool int __b) return __a + (vector unsigned int)__b; } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector signed __int128 __ATTRS_o_ai +vec_add(vector signed __int128 __a, vector signed __int128 __b) +{ + return __a + __b; +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __a + __b; +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + static vector float __ATTRS_o_ai vec_add(vector float __a, vector float __b) { @@ -383,12 +409,24 @@ vec_vaddfp(vector float __a, vector float __b) /* vec_addc */ -static vector unsigned int __attribute__((__always_inline__)) +static vector unsigned int __ATTRS_o_ai vec_addc(vector unsigned int __a, vector unsigned int __b) { return __builtin_altivec_vaddcuw(__a, __b); } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector signed __int128 __ATTRS_o_ai +vec_addc(vector signed __int128 __a, vector signed __int128 __b) { + return __builtin_altivec_vaddcuq(__a, __b); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __builtin_altivec_vaddcuq(__a, __b); +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + /* vec_vaddcuw */ static vector unsigned int __attribute__((__always_inline__)) @@ -627,6 +665,64 @@ vec_vadduws(vector unsigned int __a, vector bool int __b) return __builtin_altivec_vadduws(__a, (vector unsigned int)__b); } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +/* vec_vadduqm */ + +static vector signed __int128 __ATTRS_o_ai +vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) +{ + return __a + __b; +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __a + __b; +} + +/* vec_vaddeuqm */ + +static vector signed __int128 __ATTRS_o_ai +vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vaddeuqm(__a, __b, __c); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vaddeuqm(__a, __b, __c); +} + +/* vec_vaddcuq */ + +static vector signed __int128 __ATTRS_o_ai +vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) +{ + return __builtin_altivec_vaddcuq(__a, __b); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __builtin_altivec_vaddcuq(__a, __b); +} + +/* vec_vaddecuq */ + +static vector signed __int128 __ATTRS_o_ai +vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vaddecuq(__a, __b, __c); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vaddecuq(__a, __b, __c); +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + /* vec_and */ #define __builtin_altivec_vand vec_and @@ -1387,6 +1483,21 @@ vec_cmpeq(vector unsigned int __a, vector unsigned int __b) __builtin_altivec_vcmpequw((vector int)__a, (vector int)__b); } +#ifdef __POWER8_VECTOR__ +static vector bool long long __ATTRS_o_ai +vec_cmpeq(vector signed long long __a, vector signed long long __b) +{ + return (vector bool long long) __builtin_altivec_vcmpequd(__a, __b); +} + +static vector bool long long __ATTRS_o_ai +vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) +{ + return (vector bool long long) + __builtin_altivec_vcmpequd((vector long long)__a, (vector long long) __b); +} +#endif + static vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a, vector float __b) { @@ -1447,6 +1558,20 @@ vec_cmpgt(vector unsigned int __a, vector unsigned int __b) return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b); } +#ifdef __POWER8_VECTOR__ +static vector bool long long __ATTRS_o_ai +vec_cmpgt(vector signed long long __a, vector signed long long __b) +{ + return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b); +} + +static vector bool long long __ATTRS_o_ai +vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) +{ + return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b); +} +#endif + static vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a, vector float __b) { @@ -2270,7 +2395,7 @@ vec_vlogefp(vector float __a) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const signed char *__b) { @@ -2289,7 +2414,7 @@ vec_lvsl(int __a, const signed char *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) { @@ -2308,7 +2433,7 @@ vec_lvsl(int __a, const unsigned char *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const short *__b) { @@ -2327,7 +2452,7 @@ vec_lvsl(int __a, const short *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) { @@ -2346,7 +2471,7 @@ vec_lvsl(int __a, const unsigned short *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const int *__b) { @@ -2365,7 +2490,7 @@ vec_lvsl(int __a, const int *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) { @@ -2384,7 +2509,7 @@ vec_lvsl(int __a, const unsigned int *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsl(int __a, const float *__b) { @@ -2405,7 +2530,7 @@ vec_lvsl(int __a, const float *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const signed char *__b) { @@ -2424,7 +2549,7 @@ vec_lvsr(int __a, const signed char *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) { @@ -2443,7 +2568,7 @@ vec_lvsr(int __a, const unsigned char *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const short *__b) { @@ -2462,7 +2587,7 @@ vec_lvsr(int __a, const short *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) { @@ -2481,7 +2606,7 @@ vec_lvsr(int __a, const unsigned short *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const int *__b) { @@ -2500,7 +2625,7 @@ vec_lvsr(int __a, const int *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) { @@ -2519,7 +2644,7 @@ vec_lvsr(int __a, const unsigned int *__b) #ifdef __LITTLE_ENDIAN__ static vector unsigned char __ATTRS_o_ai -__attribute__((deprecated("use assignment for unaligned little endian \ +__attribute__((__deprecated__("use assignment for unaligned little endian \ loads/stores"))) vec_lvsr(int __a, const float *__b) { @@ -2679,6 +2804,20 @@ vec_max(vector unsigned int __a, vector bool int __b) return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_max(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vmaxsd(__a, __b); +} + +static vector unsigned long long __ATTRS_o_ai +vec_max(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vmaxud(__a, __b); +} +#endif + static vector float __ATTRS_o_ai vec_max(vector float __a, vector float __b) { @@ -3327,6 +3466,20 @@ vec_min(vector unsigned int __a, vector bool int __b) return __builtin_altivec_vminuw(__a, (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_min(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vminsd(__a, __b); +} + +static vector unsigned long long __ATTRS_o_ai +vec_min(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vminud(__a, __b); +} +#endif + static vector float __ATTRS_o_ai vec_min(vector float __a, vector float __b) { @@ -3762,6 +3915,28 @@ vec_mule(vector unsigned short __a, vector unsigned short __b) #endif } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_mule(vector signed int __a, vector signed int __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulosw(__a, __b); +#else + return __builtin_altivec_vmulesw(__a, __b); +#endif +} + +static vector unsigned long long __ATTRS_o_ai +vec_mule(vector unsigned int __a, vector unsigned int __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulouw(__a, __b); +#else + return __builtin_altivec_vmuleuw(__a, __b); +#endif +} +#endif + /* vec_vmulesb */ static vector short __attribute__((__always_inline__)) @@ -3852,6 +4027,28 @@ vec_mulo(vector unsigned short __a, vector unsigned short __b) #endif } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_mulo(vector signed int __a, vector signed int __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulesw(__a, __b); +#else + return __builtin_altivec_vmulosw(__a, __b); +#endif +} + +static vector unsigned long long __ATTRS_o_ai +vec_mulo(vector unsigned int __a, vector unsigned int __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmuleuw(__a, __b); +#else + return __builtin_altivec_vmulouw(__a, __b); +#endif +} +#endif + /* vec_vmulosb */ static vector short __attribute__((__always_inline__)) @@ -4525,6 +4722,58 @@ vec_vpkuwum(vector bool int __a, vector bool int __b) #endif } +/* vec_vpkudum */ + +#ifdef __POWER8_VECTOR__ +#define __builtin_altivec_vpkudum vec_vpkudum + +static vector int __ATTRS_o_ai +vec_vpkudum(vector long long __a, vector long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector int)vec_perm(__a, __b, (vector unsigned char) + (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B)); +#else + return (vector int)vec_perm(__a, __b, (vector unsigned char) + (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F)); +#endif +} + +static vector unsigned int __ATTRS_o_ai +vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector unsigned int)vec_perm(__a, __b, (vector unsigned char) + (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B)); +#else + return (vector unsigned int)vec_perm(__a, __b, (vector unsigned char) + (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F)); +#endif +} + +static vector bool int __ATTRS_o_ai +vec_vpkudum(vector bool long long __a, vector bool long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector bool int)vec_perm((vector long long)__a, + (vector long long)__b, + (vector unsigned char) + (0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B)); +#else + return (vector bool int)vec_perm((vector long long)__a, + (vector long long)__b, + (vector unsigned char) + (0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F)); +#endif +} +#endif + /* vec_packpx */ static vector pixel __attribute__((__always_inline__)) @@ -4591,6 +4840,28 @@ vec_packs(vector unsigned int __a, vector unsigned int __b) #endif } +#ifdef __POWER8_VECTOR__ +static vector int __ATTRS_o_ai +vec_packs(vector long long __a, vector long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpksdss(__b, __a); +#else + return __builtin_altivec_vpksdss(__a, __b); +#endif +} + +static vector unsigned int __ATTRS_o_ai +vec_packs(vector unsigned long long __a, vector unsigned long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpkudus(__b, __a); +#else + return __builtin_altivec_vpkudus(__a, __b); +#endif +} +#endif + /* vec_vpkshss */ static vector signed char __attribute__((__always_inline__)) @@ -4603,6 +4874,20 @@ vec_vpkshss(vector short __a, vector short __b) #endif } +/* vec_vpksdss */ + +#ifdef __POWER8_VECTOR__ +static vector int __ATTRS_o_ai +vec_vpksdss(vector long long __a, vector long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpksdss(__b, __a); +#else + return __builtin_altivec_vpksdss(__a, __b); +#endif +} +#endif + /* vec_vpkuhus */ static vector unsigned char __attribute__((__always_inline__)) @@ -4615,6 +4900,20 @@ vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) #endif } +/* vec_vpkudus */ + +#ifdef __POWER8_VECTOR__ +static vector unsigned int __attribute__((__always_inline__)) +vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpkudus(__b, __a); +#else + return __builtin_altivec_vpkudus(__a, __b); +#endif +} +#endif + /* vec_vpkswss */ static vector signed short __attribute__((__always_inline__)) @@ -4681,6 +4980,28 @@ vec_packsu(vector unsigned int __a, vector unsigned int __b) #endif } +#ifdef __POWER8_VECTOR__ +static vector unsigned int __ATTRS_o_ai +vec_packsu(vector long long __a, vector long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpksdus(__b, __a); +#else + return __builtin_altivec_vpksdus(__a, __b); +#endif +} + +static vector unsigned int __ATTRS_o_ai +vec_packsu(vector unsigned long long __a, vector unsigned long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpkudus(__b, __a); +#else + return __builtin_altivec_vpkudus(__a, __b); +#endif +} +#endif + /* vec_vpkshus */ static vector unsigned char __ATTRS_o_ai @@ -4725,6 +5046,20 @@ vec_vpkswus(vector unsigned int __a, vector unsigned int __b) #endif } +/* vec_vpksdus */ + +#ifdef __POWER8_VECTOR__ +static vector unsigned int __ATTRS_o_ai +vec_vpksdus(vector long long __a, vector long long __b) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vpksdus(__b, __a); +#else + return __builtin_altivec_vpksdus(__a, __b); +#endif +} +#endif + /* vec_perm */ // The vperm instruction is defined architecturally with a big-endian bias. @@ -5095,6 +5430,20 @@ vec_rl(vector unsigned int __a, vector unsigned int __b) return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b); } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_rl(vector signed long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vrld(__a, __b); +} + +static vector unsigned long long __ATTRS_o_ai +vec_rl(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vrld(__a, __b); +} +#endif + /* vec_vrlb */ static vector signed char __ATTRS_o_ai @@ -5465,6 +5814,20 @@ vec_sl(vector unsigned int __a, vector unsigned int __b) return __a << __b; } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_sl(vector signed long long __a, vector unsigned long long __b) +{ + return __a << (vector long long)__b; +} + +static vector unsigned long long __ATTRS_o_ai +vec_sl(vector unsigned long long __a, vector unsigned long long __b) +{ + return __a << __b; +} +#endif + /* vec_vslb */ #define __builtin_altivec_vslb vec_vslb @@ -6566,6 +6929,20 @@ vec_sr(vector unsigned int __a, vector unsigned int __b) return __a >> __b; } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_sr(vector signed long long __a, vector unsigned long long __b) +{ + return __a >> (vector long long)__b; +} + +static vector unsigned long long __ATTRS_o_ai +vec_sr(vector unsigned long long __a, vector unsigned long long __b) +{ + return __a >> __b; +} +#endif + /* vec_vsrb */ #define __builtin_altivec_vsrb vec_vsrb @@ -6652,6 +7029,20 @@ vec_sra(vector unsigned int __a, vector unsigned int __b) return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b); } +#ifdef __POWER8_VECTOR__ +static vector signed long long __ATTRS_o_ai +vec_sra(vector signed long long __a, vector unsigned long long __b) +{ + return __a >> __b; +} + +static vector unsigned long long __ATTRS_o_ai +vec_sra(vector unsigned long long __a, vector unsigned long long __b) +{ + return (vector unsigned long long) ( (vector signed long long) __a >> __b); +} +#endif + /* vec_vsrab */ static vector signed char __ATTRS_o_ai @@ -8224,6 +8615,20 @@ vec_sub(vector unsigned int __a, vector bool int __b) return __a - (vector unsigned int)__b; } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector signed __int128 __ATTRS_o_ai +vec_sub(vector signed __int128 __a, vector signed __int128 __b) +{ + return __a - __b; +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __a - __b; +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + static vector float __ATTRS_o_ai vec_sub(vector float __a, vector float __b) { @@ -8362,12 +8767,26 @@ vec_vsubfp(vector float __a, vector float __b) /* vec_subc */ -static vector unsigned int __attribute__((__always_inline__)) +static vector unsigned int __ATTRS_o_ai vec_subc(vector unsigned int __a, vector unsigned int __b) { return __builtin_altivec_vsubcuw(__a, __b); } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +static vector unsigned __int128 __ATTRS_o_ai +vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __builtin_altivec_vsubcuq(__a, __b); +} + +static vector signed __int128 __ATTRS_o_ai +vec_subc(vector signed __int128 __a, vector signed __int128 __b) +{ + return __builtin_altivec_vsubcuq(__a, __b); +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + /* vec_vsubcuw */ static vector unsigned int __attribute__((__always_inline__)) @@ -8606,6 +9025,68 @@ vec_vsubuws(vector unsigned int __a, vector bool int __b) return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b); } +#if defined(__POWER8_VECTOR__) && defined(__powerpc64__) +/* vec_vsubuqm */ + +static vector signed __int128 __ATTRS_o_ai +vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) +{ + return __a - __b; +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __a - __b; +} + +/* vec_vsubeuqm */ + +static vector signed __int128 __ATTRS_o_ai +vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) +{ + return __builtin_altivec_vsubeuqm(__a, __b, __c); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) +{ + return __builtin_altivec_vsubeuqm(__a, __b, __c); +} + +/* vec_vsubcuq */ + +static vector signed __int128 __ATTRS_o_ai +vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) +{ + return __builtin_altivec_vsubcuq(__a, __b); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) +{ + return __builtin_altivec_vsubcuq(__a, __b); +} + +/* vec_vsubecuq */ + +static vector signed __int128 __ATTRS_o_ai +vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) +{ + return __builtin_altivec_vsubecuq(__a, __b, __c); +} + +static vector unsigned __int128 __ATTRS_o_ai +vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) +{ + return __builtin_altivec_vsubecuq(__a, __b, __c); +} +#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__) + /* vec_sum4s */ static vector int __ATTRS_o_ai @@ -8797,6 +9278,28 @@ vec_unpackh(vector pixel __a) #endif } +#ifdef __POWER8_VECTOR__ +static vector long long __ATTRS_o_ai +vec_unpackh(vector int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vupklsw(__a); +#else + return __builtin_altivec_vupkhsw(__a); +#endif +} + +static vector bool long long __ATTRS_o_ai +vec_unpackh(vector bool int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a); +#else + return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a); +#endif +} +#endif + /* vec_vupkhsb */ static vector short __ATTRS_o_ai @@ -8851,6 +9354,30 @@ vec_vupkhsh(vector pixel __a) #endif } +/* vec_vupkhsw */ + +#ifdef __POWER8_VECTOR__ +static vector long long __ATTRS_o_ai +vec_vupkhsw(vector int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vupklsw(__a); +#else + return __builtin_altivec_vupkhsw(__a); +#endif +} + +static vector bool long long __ATTRS_o_ai +vec_vupkhsw(vector bool int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a); +#else + return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a); +#endif +} +#endif + /* vec_unpackl */ static vector short __ATTRS_o_ai @@ -8903,6 +9430,28 @@ vec_unpackl(vector pixel __a) #endif } +#ifdef __POWER8_VECTOR__ +static vector long long __ATTRS_o_ai +vec_unpackl(vector int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vupkhsw(__a); +#else + return __builtin_altivec_vupklsw(__a); +#endif +} + +static vector bool long long __ATTRS_o_ai +vec_unpackl(vector bool int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a); +#else + return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a); +#endif +} +#endif + /* vec_vupklsb */ static vector short __ATTRS_o_ai @@ -8957,6 +9506,30 @@ vec_vupklsh(vector pixel __a) #endif } +/* vec_vupklsw */ + +#ifdef __POWER8_VECTOR__ +static vector long long __ATTRS_o_ai +vec_vupklsw(vector int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vupkhsw(__a); +#else + return __builtin_altivec_vupklsw(__a); +#endif +} + +static vector bool long long __ATTRS_o_ai +vec_vupklsw(vector bool int __a) +{ +#ifdef __LITTLE_ENDIAN__ + return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a); +#else + return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a); +#endif +} +#endif + /* vec_vsx_ld */ #ifdef __VSX__ @@ -10887,6 +11460,55 @@ vec_all_eq(vector bool int __a, vector bool int __b) return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a, (vector int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_eq(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector bool long long __a, vector long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_eq(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a, + (vector long long)__b); +} +#endif + static int __ATTRS_o_ai vec_all_eq(vector float __a, vector float __b) { @@ -11033,6 +11655,56 @@ vec_all_ge(vector bool int __a, vector bool int __b) (vector unsigned int)__a); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_ge(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a); +} +static int __ATTRS_o_ai +vec_all_ge(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b, + __a); +} + +static int __ATTRS_o_ai +vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a); +} + +static int __ATTRS_o_ai +vec_all_ge(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b, + __a); +} + +static int __ATTRS_o_ai +vec_all_ge(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_all_ge(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_all_ge(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} +#endif + static int __ATTRS_o_ai vec_all_ge(vector float __a, vector float __b) { @@ -11179,6 +11851,56 @@ vec_all_gt(vector bool int __a, vector bool int __b) (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_gt(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b); +} +static int __ATTRS_o_ai +vec_all_gt(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b); +} + +static int __ATTRS_o_ai +vec_all_gt(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_all_gt(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_all_gt(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a, + __b); +} + +static int __ATTRS_o_ai +vec_all_gt(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} +#endif + static int __ATTRS_o_ai vec_all_gt(vector float __a, vector float __b) { @@ -11333,6 +12055,57 @@ vec_all_le(vector bool int __a, vector bool int __b) (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_le(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b); +} + +static int __ATTRS_o_ai +vec_all_le(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b); +} + +static int __ATTRS_o_ai +vec_all_le(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_le(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_all_le(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_all_le(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a, + __b); +} + +static int __ATTRS_o_ai +vec_all_le(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} +#endif + static int __ATTRS_o_ai vec_all_le(vector float __a, vector float __b) { @@ -11479,6 +12252,57 @@ vec_all_lt(vector bool int __a, vector bool int __b) (vector unsigned int)__a); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_lt(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b, + __a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b, + __a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_all_lt(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} +#endif + static int __ATTRS_o_ai vec_all_lt(vector float __a, vector float __b) { @@ -11633,6 +12457,56 @@ vec_all_ne(vector bool int __a, vector bool int __b) return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a, (vector int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_all_ne(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_all_ne(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a, + (vector signed long long)__b); +} +#endif + static int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) { @@ -11837,6 +12711,61 @@ vec_any_eq(vector bool int __a, vector bool int __b) __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a, (vector int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_eq(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector unsigned long long __a, vector bool long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector bool long long __a, vector signed long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector bool long long __a, vector unsigned long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_eq(vector bool long long __a, vector bool long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector signed long long)__a, + (vector signed long long)__b); +} +#endif + static int __ATTRS_o_ai vec_any_eq(vector float __a, vector float __b) { @@ -11985,6 +12914,57 @@ vec_any_ge(vector bool int __a, vector bool int __b) (vector unsigned int)__a); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_ge(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, + (vector signed long long)__b, __a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__b, __a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_any_ge(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} +#endif + static int __ATTRS_o_ai vec_any_ge(vector float __a, vector float __b) { @@ -12135,6 +13115,58 @@ vec_any_gt(vector bool int __a, vector bool int __b) (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_gt(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_gt(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, + (vector signed long long)__b); +} + + +static int __ATTRS_o_ai +vec_any_gt(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_any_gt(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_any_gt(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__a, __b); +} + +static int __ATTRS_o_ai +vec_any_gt(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} +#endif + static int __ATTRS_o_ai vec_any_gt(vector float __a, vector float __b) { @@ -12285,6 +13317,57 @@ vec_any_le(vector bool int __a, vector bool int __b) (vector unsigned int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_le(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_le(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_le(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_le(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_any_le(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} + +static int __ATTRS_o_ai +vec_any_le(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__a, __b); +} + +static int __ATTRS_o_ai +vec_any_le(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, + (vector unsigned long long)__a, + (vector unsigned long long)__b); +} +#endif + static int __ATTRS_o_ai vec_any_le(vector float __a, vector float __b) { @@ -12435,6 +13518,57 @@ vec_any_lt(vector bool int __a, vector bool int __b) (vector unsigned int)__a); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_lt(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, + (vector signed long long)__b, __a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector unsigned long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__b, __a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector bool long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector bool long long __a, vector unsigned long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, + (vector unsigned long long)__a); +} + +static int __ATTRS_o_ai +vec_any_lt(vector bool long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, + (vector unsigned long long)__b, + (vector unsigned long long)__a); +} +#endif + static int __ATTRS_o_ai vec_any_lt(vector float __a, vector float __b) { @@ -12607,6 +13741,61 @@ vec_any_ne(vector bool int __a, vector bool int __b) __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a, (vector int)__b); } +#ifdef __POWER8_VECTOR__ +static int __ATTRS_o_ai +vec_any_ne(vector signed long long __a, vector signed long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a, + (vector long long)__b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector signed long long __a, vector bool long long __b) +{ + return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector unsigned long long __a, vector bool long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector bool long long __a, vector signed long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector bool long long __a, vector unsigned long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, + (vector signed long long)__b); +} + +static int __ATTRS_o_ai +vec_any_ne(vector bool long long __a, vector bool long long __b) +{ + return + __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector signed long long)__a, + (vector signed long long)__b); +} +#endif + static int __ATTRS_o_ai vec_any_ne(vector float __a, vector float __b) { @@ -12661,6 +13850,133 @@ vec_any_out(vector float __a, vector float __b) return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b); } +/* Power 8 Crypto functions +Note: We diverge from the current GCC implementation with regard +to cryptography and related functions as follows: +- Only the SHA and AES instructions and builtins are disabled by -mno-crypto +- The remaining ones are only available on Power8 and up so + require -mpower8-vector +The justification for this is that export requirements require that +Category:Vector.Crypto is optional (i.e. compliant hardware may not provide +support). As a result, we need to be able to turn off support for those. +The remaining ones (currently controlled by -mcrypto for GCC) still +need to be provided on compliant hardware even if Vector.Crypto is not +provided. +FIXME: the naming convention for the builtins will be adjusted due +to the inconsistency (__builtin_crypto_ prefix on builtins that cannot be +removed with -mno-crypto). This is under development. +*/ +#ifdef __CRYPTO__ +static vector unsigned long long __attribute__((__always_inline__)) +__builtin_crypto_vsbox (vector unsigned long long __a) +{ + return __builtin_altivec_crypto_vsbox(__a); +} + +static vector unsigned long long __attribute__((__always_inline__)) +__builtin_crypto_vcipher (vector unsigned long long __a, + vector unsigned long long __b) +{ + return __builtin_altivec_crypto_vcipher(__a, __b); +} + +static vector unsigned long long __attribute__((__always_inline__)) +__builtin_crypto_vcipherlast (vector unsigned long long __a, + vector unsigned long long __b) +{ + return __builtin_altivec_crypto_vcipherlast(__a, __b); +} + +static vector unsigned long long __attribute__((__always_inline__)) +__builtin_crypto_vncipher (vector unsigned long long __a, + vector unsigned long long __b) +{ + return __builtin_altivec_crypto_vncipher(__a, __b); +} + +static vector unsigned long long __attribute__((__always_inline__)) +__builtin_crypto_vncipherlast (vector unsigned long long __a, + vector unsigned long long __b) +{ + return __builtin_altivec_crypto_vncipherlast(__a, __b); +} + + +#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad +#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw +#endif + +#ifdef __POWER8_VECTOR__ +static vector unsigned char __ATTRS_o_ai +__builtin_crypto_vpermxor (vector unsigned char __a, + vector unsigned char __b, + vector unsigned char __c) +{ + return __builtin_altivec_crypto_vpermxor(__a, __b, __c); +} + +static vector unsigned short __ATTRS_o_ai +__builtin_crypto_vpermxor (vector unsigned short __a, + vector unsigned short __b, + vector unsigned short __c) +{ + return (vector unsigned short) + __builtin_altivec_crypto_vpermxor((vector unsigned char) __a, + (vector unsigned char) __b, + (vector unsigned char) __c); +} + +static vector unsigned int __ATTRS_o_ai +__builtin_crypto_vpermxor (vector unsigned int __a, + vector unsigned int __b, + vector unsigned int __c) +{ + return (vector unsigned int) + __builtin_altivec_crypto_vpermxor((vector unsigned char) __a, + (vector unsigned char) __b, + (vector unsigned char) __c); +} + +static vector unsigned long long __ATTRS_o_ai +__builtin_crypto_vpermxor (vector unsigned long long __a, + vector unsigned long long __b, + vector unsigned long long __c) +{ + return (vector unsigned long long) + __builtin_altivec_crypto_vpermxor((vector unsigned char) __a, + (vector unsigned char) __b, + (vector unsigned char) __c); +} + +static vector unsigned char __ATTRS_o_ai +__builtin_crypto_vpmsumb (vector unsigned char __a, + vector unsigned char __b) +{ + return __builtin_altivec_crypto_vpmsumb(__a, __b); +} + +static vector unsigned short __ATTRS_o_ai +__builtin_crypto_vpmsumb (vector unsigned short __a, + vector unsigned short __b) +{ + return __builtin_altivec_crypto_vpmsumh(__a, __b); +} + +static vector unsigned int __ATTRS_o_ai +__builtin_crypto_vpmsumb (vector unsigned int __a, + vector unsigned int __b) +{ + return __builtin_altivec_crypto_vpmsumw(__a, __b); +} + +static vector unsigned long long __ATTRS_o_ai +__builtin_crypto_vpmsumb (vector unsigned long long __a, + vector unsigned long long __b) +{ + return __builtin_altivec_crypto_vpmsumd(__a, __b); +} +#endif + #undef __ATTRS_o_ai #endif /* __ALTIVEC_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/arm_acle.h b/contrib/llvm/tools/clang/lib/Headers/arm_acle.h index 814df2c..6c56f3b 100644 --- a/contrib/llvm/tools/clang/lib/Headers/arm_acle.h +++ b/contrib/llvm/tools/clang/lib/Headers/arm_acle.h @@ -45,23 +45,23 @@ extern "C" { /* 8.4 Hints */ #if !defined(_MSC_VER) -static __inline__ void __attribute__((always_inline, nodebug)) __wfi(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) { __builtin_arm_wfi(); } -static __inline__ void __attribute__((always_inline, nodebug)) __wfe(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) { __builtin_arm_wfe(); } -static __inline__ void __attribute__((always_inline, nodebug)) __sev(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) { __builtin_arm_sev(); } -static __inline__ void __attribute__((always_inline, nodebug)) __sevl(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) { __builtin_arm_sevl(); } -static __inline__ void __attribute__((always_inline, nodebug)) __yield(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) { __builtin_arm_yield(); } #endif @@ -71,7 +71,7 @@ static __inline__ void __attribute__((always_inline, nodebug)) __yield(void) { #endif /* 8.5 Swap */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __swp(uint32_t x, volatile uint32_t *p) { uint32_t v; do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p)); @@ -102,28 +102,28 @@ static __inline__ uint32_t __attribute__((always_inline, nodebug)) #endif /* 8.7 NOP */ -static __inline__ void __attribute__((always_inline, nodebug)) __nop(void) { +static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) { __builtin_arm_nop(); } /* 9 DATA-PROCESSING INTRINSICS */ /* 9.2 Miscellaneous data-processing intrinsics */ /* ROR */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __ror(uint32_t x, uint32_t y) { y %= 32; if (y == 0) return x; return (x >> y) | (x << (32 - y)); } -static __inline__ uint64_t __attribute__((always_inline, nodebug)) +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) __rorll(uint64_t x, uint32_t y) { y %= 64; if (y == 0) return x; return (x >> y) | (x << (64 - y)); } -static __inline__ unsigned long __attribute__((always_inline, nodebug)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __rorl(unsigned long x, uint32_t y) { #if __SIZEOF_LONG__ == 4 return __ror(x, y); @@ -134,28 +134,28 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug)) /* CLZ */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __clz(uint32_t t) { return __builtin_clz(t); } -static __inline__ unsigned long __attribute__((always_inline, nodebug)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __clzl(unsigned long t) { return __builtin_clzl(t); } -static __inline__ uint64_t __attribute__((always_inline, nodebug)) +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) __clzll(uint64_t t) { return __builtin_clzll(t); } /* REV */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __rev(uint32_t t) { return __builtin_bswap32(t); } -static __inline__ unsigned long __attribute__((always_inline, nodebug)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __revl(unsigned long t) { #if __SIZEOF_LONG__ == 4 return __builtin_bswap32(t); @@ -164,40 +164,40 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug)) #endif } -static __inline__ uint64_t __attribute__((always_inline, nodebug)) +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) __revll(uint64_t t) { return __builtin_bswap64(t); } /* REV16 */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __rev16(uint32_t t) { return __ror(__rev(t), 16); } -static __inline__ unsigned long __attribute__((always_inline, nodebug)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __rev16l(unsigned long t) { return __rorl(__revl(t), sizeof(long) / 2); } -static __inline__ uint64_t __attribute__((always_inline, nodebug)) +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) __rev16ll(uint64_t t) { return __rorll(__revll(t), 32); } /* REVSH */ -static __inline__ int16_t __attribute__((always_inline, nodebug)) +static __inline__ int16_t __attribute__((__always_inline__, __nodebug__)) __revsh(int16_t t) { return __builtin_bswap16(t); } /* RBIT */ -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __rbit(uint32_t t) { return __builtin_arm_rbit(t); } -static __inline__ uint64_t __attribute__((always_inline, nodebug)) +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) __rbitll(uint64_t t) { #if __ARM_32BIT_STATE return (((uint64_t) __builtin_arm_rbit(t)) << 32) | @@ -207,7 +207,7 @@ static __inline__ uint64_t __attribute__((always_inline, nodebug)) #endif } -static __inline__ unsigned long __attribute__((always_inline, nodebug)) +static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__)) __rbitl(unsigned long t) { #if __SIZEOF_LONG__ == 4 return __rbit(t); @@ -230,17 +230,17 @@ static __inline__ unsigned long __attribute__((always_inline, nodebug)) /* 9.4.2 Saturating addition and subtraction intrinsics */ #if __ARM_32BIT_STATE -static __inline__ int32_t __attribute__((always_inline, nodebug)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __qadd(int32_t t, int32_t v) { return __builtin_arm_qadd(t, v); } -static __inline__ int32_t __attribute__((always_inline, nodebug)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __qsub(int32_t t, int32_t v) { return __builtin_arm_qsub(t, v); } -static __inline__ int32_t __attribute__((always_inline, nodebug)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __qdbl(int32_t t) { return __builtin_arm_qadd(t, t); } @@ -248,42 +248,42 @@ __qdbl(int32_t t) { /* 9.7 CRC32 intrinsics */ #if __ARM_FEATURE_CRC32 -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32b(uint32_t a, uint8_t b) { return __builtin_arm_crc32b(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32h(uint32_t a, uint16_t b) { return __builtin_arm_crc32h(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32w(uint32_t a, uint32_t b) { return __builtin_arm_crc32w(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32d(uint32_t a, uint64_t b) { return __builtin_arm_crc32d(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32cb(uint32_t a, uint8_t b) { return __builtin_arm_crc32cb(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32ch(uint32_t a, uint16_t b) { return __builtin_arm_crc32ch(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32cw(uint32_t a, uint32_t b) { return __builtin_arm_crc32cw(a, b); } -static __inline__ uint32_t __attribute__((always_inline, nodebug)) +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __crc32cd(uint32_t a, uint64_t b) { return __builtin_arm_crc32cd(a, b); } diff --git a/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h b/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h index 394fdfe..e1e639d 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h @@ -160,7 +160,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) #define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \ __m256i __V1 = (V1); \ __m256i __V2 = (V2); \ - (__m256d)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \ + (__m256i)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \ (((M) & 0x01) ? 16 : 0), \ (((M) & 0x02) ? 17 : 1), \ (((M) & 0x04) ? 18 : 2), \ @@ -542,6 +542,8 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) __m256i __a = (a); \ (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); }) +#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) + static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_slli_epi16(__m256i __a, int __count) { @@ -606,6 +608,8 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) __m256i __a = (a); \ (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); }) +#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) + static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_srli_epi16(__m256i __a, int __count) { @@ -756,6 +760,12 @@ _mm_broadcastss_ps(__m128 __X) return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X); } +static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) +_mm_broadcastsd_pd(__m128d __a) +{ + return __builtin_shufflevector(__a, __a, 0, 0); +} + static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_broadcastss_ps(__m128 __X) { @@ -771,7 +781,7 @@ _mm256_broadcastsd_pd(__m128d __X) static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_broadcastsi128_si256(__m128i __X) { - return (__m256i)__builtin_ia32_vbroadcastsi256(__X); + return (__m256i)__builtin_shufflevector(__X, __X, 0, 1, 0, 1); } #define _mm_blend_epi32(V1, V2, M) __extension__ ({ \ @@ -874,14 +884,21 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256 __b) __m256i __V2 = (V2); \ (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); }) -#define _mm256_extracti128_si256(A, O) __extension__ ({ \ - __m256i __A = (A); \ - (__m128i)__builtin_ia32_extract128i256(__A, (O)); }) - -#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \ - __m256i __V1 = (V1); \ - __m128i __V2 = (V2); \ - (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); }) +#define _mm256_extracti128_si256(V, M) __extension__ ({ \ + (__m128i)__builtin_shufflevector( \ + (__v4di)(V), \ + (__v4di)(_mm256_setzero_si256()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + +#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \ + (__m256i)__builtin_shufflevector( \ + (__v4di)(V1), \ + (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_maskload_epi32(int const *__X, __m256i __M) diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h index bc4d4ac..d0591e4 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h @@ -21,15 +21,37 @@ * *===-----------------------------------------------------------------------=== */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead." +#endif #ifndef __AVX512BWINTRIN_H #define __AVX512BWINTRIN_H typedef unsigned int __mmask32; typedef unsigned long long __mmask64; -typedef char __v64qi __attribute__ ((vector_size (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); typedef short __v32hi __attribute__ ((__vector_size__ (64))); +static __inline __v64qi __attribute__ ((__always_inline__, __nodebug__)) +_mm512_setzero_qi (void) { + return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +static __inline __v32hi __attribute__ ((__always_inline__, __nodebug__)) +_mm512_setzero_hi (void) { + return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} /* Integer compare */ @@ -45,6 +67,18 @@ _mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { __u); } +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, + __u); +} + static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) _mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) { return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b, @@ -57,4 +91,406 @@ _mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { __u); } +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, + __u); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, + (__mmask64)-1); +} + +static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, + __u); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_add_epi8 (__m512i __A, __m512i __B) { + return (__m512i) ((__v64qi) __A + (__v64qi) __B); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_qi (), + (__mmask64) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_sub_epi8 (__m512i __A, __m512i __B) { + return (__m512i) ((__v64qi) __A - (__v64qi) __B); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_qi (), + (__mmask64) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_add_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hi) __A + (__v32hi) __B); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_hi (), + (__mmask32) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_sub_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hi) __A - (__v32hi) __B); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_hi (), + (__mmask32) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mullo_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hi) __A * (__v32hi) __B); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_hi (), + (__mmask32) __U); +} + +#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), \ + (p), (__mmask64)-1); }) + +#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), \ + (p), (__mmask64)(m)); }) + +#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), \ + (p), (__mmask64)-1); }) + +#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), \ + (p), (__mmask64)(m)); }) + +#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), \ + (p), (__mmask32)-1); }) + +#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), \ + (p), (__mmask32)(m)); }) + +#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), \ + (p), (__mmask32)-1); }) + +#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), \ + (p), (__mmask32)(m)); }) + #endif diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h new file mode 100644 index 0000000..fd33be2 --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/avx512dqintrin.h @@ -0,0 +1,237 @@ +/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512DQINTRIN_H +#define __AVX512DQINTRIN_H + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mullo_epi64 (__m512i __A, __m512i __B) { + return (__m512i) ((__v8di) __A * (__v8di) __B); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_xor_pd (__m512d __A, __m512d __B) { + return (__m512d) ((__v8di) __A ^ (__v8di) __B); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_xor_ps (__m512 __A, __m512 __B) { + return (__m512) ((__v16si) __A ^ (__v16si) __B); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_or_pd (__m512d __A, __m512d __B) { + return (__m512d) ((__v8di) __A | (__v8di) __B); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_or_ps (__m512 __A, __m512 __B) { + return (__m512) ((__v16si) __A | (__v16si) __B); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_and_pd (__m512d __A, __m512d __B) { + return (__m512d) ((__v8di) __A & (__v8di) __B); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_and_ps (__m512 __A, __m512 __B) { + return (__m512) ((__v16si) __A & (__v16si) __B); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_andnot_pd (__m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_andnot_ps (__m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#endif diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512erintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512erintrin.h index 1a5ea15..57c61aa 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512erintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512erintrin.h @@ -28,85 +28,259 @@ #define __AVX512ERINTRIN_H +// exp2a23 +#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (R)); }) + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(S, M, A) \ + _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(M, A) \ + _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \ + (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask8)(M), (R)); }) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(S, M, A) \ + _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(M, A) \ + _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + // rsqrt28 -static __inline__ __m512d __attribute__((__always_inline__, __nodebug__)) -_mm512_rsqrt28_round_pd (__m512d __A, int __R) -{ - return (__m512d)__builtin_ia32_rsqrt28pd_mask ((__v8df)__A, - (__v8df)_mm512_setzero_pd(), - (__mmask8)-1, - __R); -} -static __inline__ __m512 __attribute__((__always_inline__, __nodebug__)) -_mm512_rsqrt28_round_ps(__m512 __A, int __R) -{ - return (__m512)__builtin_ia32_rsqrt28ps_mask ((__v16sf)__A, - (__v16sf)_mm512_setzero_ps(), - (__mmask16)-1, - __R); -} - -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) -{ - return (__m128) __builtin_ia32_rsqrt28ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, - __R); -} - -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) -{ - return (__m128d) __builtin_ia32_rsqrt28sd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, - __R); -} +#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (R)); }) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(S, M, A) \ + _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(M, A) \ + _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (R)); }) + +#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), \ + (__mmask16)(M), (R)); }) + +#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (R)); }) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(S, M, A) \ + _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(M, A) \ + _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (R)); }) + +#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (R)); }) + +#define _mm_rsqrt28_ss(A, B) \ + _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(S, M, A, B) \ + _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(M, A, B) \ + _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (R)); }) + +#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (R)); }) + +#define _mm_rsqrt28_sd(A, B) \ + _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(S, M, A, B) \ + _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(M, A, B) \ + _mm_mask_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) // rcp28 -static __inline__ __m512d __attribute__((__always_inline__, __nodebug__)) -_mm512_rcp28_round_pd (__m512d __A, int __R) -{ - return (__m512d)__builtin_ia32_rcp28pd_mask ((__v8df)__A, - (__v8df)_mm512_setzero_pd(), - (__mmask8)-1, - __R); -} - -static __inline__ __m512 __attribute__((__always_inline__, __nodebug__)) -_mm512_rcp28_round_ps (__m512 __A, int __R) -{ - return (__m512)__builtin_ia32_rcp28ps_mask ((__v16sf)__A, - (__v16sf)_mm512_setzero_ps (), - (__mmask16)-1, - __R); -} - -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) -{ - return (__m128) __builtin_ia32_rcp28ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, - __R); -} -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R) -{ - return (__m128d) __builtin_ia32_rcp28sd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, - __R); -} +#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \ + (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (R)); }) + +#define _mm512_rcp28_pd(A) \ + _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(S, M, A) \ + _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(M, A) \ + _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (R)); }) + +#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), \ + (__mmask16)(M), (R)); }) + +#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \ + (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (R)); }) + +#define _mm512_rcp28_ps(A) \ + _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(S, M, A) \ + _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(M, A) \ + _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (R)); }) + +#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \ + (__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (R)); }) + +#define _mm_rcp28_ss(A, B) \ + _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(S, M, A, B) \ + _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(M, A, B) \ + _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (R)); }) + +#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (R)); }) + +#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \ + (__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (R)); }) + +#define _mm_rcp28_sd(A, B) \ + _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_sd(S, M, A, B) \ + _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(M, A, B) \ + _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) #endif // __AVX512ERINTRIN_H diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h index 9c80710..d299704 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h @@ -162,6 +162,224 @@ _mm512_castps512_ps128(__m512 __a) return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); } +/* Bitwise operators */ +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_and_epi32(__m512i __a, __m512i __b) +{ + return __a & __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) __src, + (__mmask16) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pandd512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_and_epi64(__m512i __a, __m512i __b) +{ + return __a & __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) __src, + (__mmask8) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_andnot_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_andnot_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_or_epi32(__m512i __a, __m512i __b) +{ + return __a | __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) __src, + (__mmask16) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pord512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_or_epi64(__m512i __a, __m512i __b) +{ + return __a | __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) __src, + (__mmask8) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_xor_epi32(__m512i __a, __m512i __b) +{ + return __a ^ __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) __src, + (__mmask16) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pxord512_mask((__v16si) __a, + (__v16si) __b, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_xor_epi64(__m512i __a, __m512i __b) +{ + return __a ^ __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) __src, + (__mmask8) __k); +} +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __a, + (__v8di) __b, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __k); +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_and_si512(__m512i __a, __m512i __b) +{ + return __a & __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_or_si512(__m512i __a, __m512i __b) +{ + return __a | __b; +} + +static __inline__ __m512i __attribute__((__always_inline__, __nodebug__)) +_mm512_xor_si512(__m512i __a, __m512i __b) +{ + return __a ^ __b; +} /* Arithmetic */ static __inline __m512d __attribute__((__always_inline__, __nodebug__)) @@ -200,6 +418,106 @@ _mm512_sub_ps(__m512 __a, __m512 __b) return __a - __b; } +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_add_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8di) __A + (__v8di) __B); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_sub_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8di) __A - (__v8di) __B); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_add_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16si) __A + (__v16si) __B); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_sub_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16si) __A - (__v16si) __B); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + static __inline__ __m512d __attribute__((__always_inline__, __nodebug__)) _mm512_max_pd(__m512d __A, __m512d __B) { @@ -337,6 +655,24 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y) } static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, @@ -346,6 +682,48 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y) (__mmask8) -1); } +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mullo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16si) __A * (__v16si) __B); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + static __inline__ __m512d __attribute__((__always_inline__, __nodebug__)) _mm512_sqrt_pd(__m512d a) { @@ -492,20 +870,13 @@ _mm512_abs_epi32(__m512i __A) (__mmask16) -1); } -static __inline __m512 __attribute__ ((__always_inline__, __nodebug__)) -_mm512_roundscale_ps(__m512 __A, const int __imm) -{ - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, - (__v16sf) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} -static __inline __m512d __attribute__ ((__always_inline__, __nodebug__)) -_mm512_roundscale_pd(__m512d __A, const int __imm) -{ - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, - (__v8df) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} +#define _mm512_roundscale_ps(A, B) __extension__ ({ \ + (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(A), (B), (__v16sf)(A), \ + -1, _MM_FROUND_CUR_DIRECTION); }) + +#define _mm512_roundscale_pd(A, B) __extension__ ({ \ + (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(A), (B), (__v8df)(A), \ + -1, _MM_FROUND_CUR_DIRECTION); }) static __inline__ __m512d __attribute__((__always_inline__, __nodebug__)) _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) @@ -613,25 +984,35 @@ _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) (__mmask16) -1); } -static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_valign_epi64(__m512i __A, __m512i __B, const int __I) -{ - return (__m512i) __builtin_ia32_alignq512_mask((__v8di)__A, - (__v8di)__B, - __I, - (__v8di)_mm512_setzero_si512(), - (__mmask8) -1); -} - -static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_valign_epi32(__m512i __A, __m512i __B, const int __I) -{ - return (__m512i)__builtin_ia32_alignd512_mask((__v16si)__A, - (__v16si)__B, - __I, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); -} +#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_alignq512_mask((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (I), (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1); }) + +#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_alignd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (I), (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1); }) + +/* Vector Extract */ + +#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ + __m512d __A = (A); \ + (__m256d) \ + __builtin_ia32_extractf64x4_mask((__v8df)__A, \ + (I), \ + (__v4df)_mm256_setzero_si256(), \ + (__mmask8) -1); }) + +#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ + __m512 __A = (A); \ + (__m128) \ + __builtin_ia32_extractf32x4_mask((__v16sf)__A, \ + (I), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8) -1); }) /* Vector Blend */ @@ -669,22 +1050,37 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Compare */ -static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cmp_ps_mask(__m512 a, __m512 b, const int p) -{ - return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) a, - (__v16sf) b, p, (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} +#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (P), (__mmask16)-1, (R)); }) -static __inline __mmask8 __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P) -{ - return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, - (__v8df) __Y, __P, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} +#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (P), (__mmask16)(U), (R)); }) + +#define _mm512_cmp_ps_mask(A, B, P) \ + _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_ps_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (P), (__mmask8)-1, (R)); }) + +#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (P), (__mmask8)(U), (R)); }) + +#define _mm512_cmp_pd_mask(A, B, P) \ + _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_pd_mask(U, A, B, P) \ + _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) /* Conversion */ @@ -698,25 +1094,15 @@ _mm512_cvttps_epu32(__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512 __attribute__ (( __always_inline__, __nodebug__)) -_mm512_cvt_roundepi32_ps(__m512i __A, const int __R) -{ - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1, - __R); -} +#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (R)); }) -static __inline __m512 __attribute__ (( __always_inline__, __nodebug__)) -_mm512_cvt_roundepu32_ps(__m512i __A, const int __R) -{ - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1, - __R); -} +#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (R)); }) static __inline __m512d __attribute__ (( __always_inline__, __nodebug__)) _mm512_cvtepi32_pd(__m256i __A) @@ -735,25 +1121,16 @@ _mm512_cvtepu32_pd(__m256i __A) _mm512_setzero_pd (), (__mmask8) -1); } -static __inline __m256 __attribute__ (( __always_inline__, __nodebug__)) -_mm512_cvt_roundpd_ps(__m512d __A, const int __R) -{ - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1, - __R); -} -static __inline __m256i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvtps_ph(__m512 __A, const int __I) -{ - return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, - __I, - (__v16hi) - _mm256_setzero_si256 (), - -1); -} +#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \ + (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_cvtps_ph(A, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(A), (I), \ + (__v16hi)_mm256_setzero_si256(), \ + -1); }) static __inline __m512 __attribute__ ((__always_inline__, __nodebug__)) _mm512_cvtph_ps(__m256i __A) @@ -783,61 +1160,35 @@ _mm512_cvttpd_epi32(__m512d a) _MM_FROUND_CUR_DIRECTION); } -static __inline __m256i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvtt_roundpd_epi32(__m512d __A, const int __R) -{ - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1, - __R); -} -static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvtt_roundps_epi32(__m512 __A, const int __R) -{ - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1, - __R); -} +#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \ + (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (R)); }) -static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvt_roundps_epi32(__m512 __A, const int __R) -{ - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1, - __R); -} -static __inline __m256i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvt_roundpd_epi32(__m512d __A, const int __R) -{ - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1, - __R); -} -static __inline __m512i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvt_roundps_epu32(__m512 __A, const int __R) -{ - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1, - __R); -} -static __inline __m256i __attribute__ ((__always_inline__, __nodebug__)) -_mm512_cvt_roundpd_epu32(__m512d __A, const int __R) -{ - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1, - __R); -} +#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \ + (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (R)); }) + +#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \ + (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (R)); }) + +#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \ + (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (R)); }) + +#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \ + (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (R)); }) + +#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \ + (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8) -1, (R)); }) /* Unpack and Interleave */ static __inline __m512d __attribute__((__always_inline__, __nodebug__)) @@ -928,12 +1279,30 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) (__mmask8) __U); } +static __inline __m512 __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_load_ps(__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline __m512d __attribute__ ((__always_inline__, __nodebug__)) +_mm512_maskz_load_pd(__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + static __inline __m512d __attribute__((__always_inline__, __nodebug__)) _mm512_loadu_pd(double const *__p) { struct __loadu_pd { __m512d __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -942,10 +1311,28 @@ _mm512_loadu_ps(float const *__p) { struct __loadu_ps { __m512 __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } +static __inline __m512 __attribute__((__always_inline__, __nodebug__)) +_mm512_load_ps(double const *__p) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +static __inline __m512d __attribute__((__always_inline__, __nodebug__)) +_mm512_load_pd(float const *__p) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + /* SIMD store ops */ static __inline void __attribute__ ((__always_inline__, __nodebug__)) @@ -988,9 +1375,9 @@ _mm512_storeu_ps(void *__P, __m512 __A) } static __inline void __attribute__ ((__always_inline__, __nodebug__)) -_mm512_store_ps(void *__P, __m512 __A) +_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { - *(__m512*)__P = __A; + __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); } static __inline void __attribute__ ((__always_inline__, __nodebug__)) @@ -999,6 +1386,19 @@ _mm512_store_pd(void *__P, __m512d __A) *(__m512d*)__P = __A; } +static __inline void __attribute__ ((__always_inline__, __nodebug__)) +_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, + (__mmask16) __U); +} + +static __inline void __attribute__ ((__always_inline__, __nodebug__)) +_mm512_store_ps(void *__P, __m512 __A) +{ + *(__m512*)__P = __A; +} + /* Mask ops */ static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__)) @@ -1021,6 +1421,18 @@ _mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { __u); } +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0, + __u); +} + static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) _mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b, @@ -1033,4 +1445,303 @@ _mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) { (__mmask8)-1); } +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4, + __u); +} + +#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \ + (__mmask16)-1); }) + +#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \ + (__mmask16)-1); }) + +#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \ + (__mmask8)-1); }) + +#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \ + (__mmask8)-1); }) + +#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, (p), \ + (__mmask16)(m)); }) + +#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, (p), \ + (__mmask16)(m)); }) + +#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, (p), \ + (__mmask8)(m)); }) + +#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ + __m512i __a = (a); \ + __m512i __b = (b); \ + (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, (p), \ + (__mmask8)(m)); }) #endif // __AVX512FINTRIN_H diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vlbwintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vlbwintrin.h index 11333f8..c3b087e 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512vlbwintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vlbwintrin.h @@ -42,6 +42,17 @@ _mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { __u); } +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, + __u); +} static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) _mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) { @@ -55,6 +66,18 @@ _mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { __u); } +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, + __u); +} + static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) { return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b, @@ -67,6 +90,17 @@ _mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { __u); } +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, + __u); +} static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) _mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) { @@ -80,4 +114,744 @@ _mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { __u); } +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epi8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epi16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, + __u); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, + (__mmask32)-1); +} + +static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, + __u); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, + (__mmask16)-1); +} + +static __inline__ __mmask16 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, + __u); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), \ + (p), (__mmask16)-1); }) + +#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), \ + (p), (__mmask16)(m)); }) + +#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), \ + (p), (__mmask16)-1); }) + +#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), \ + (p), (__mmask16)(m)); }) + +#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), \ + (p), (__mmask32)-1); }) + +#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), \ + (p), (__mmask32)(m)); }) + +#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), \ + (p), (__mmask32)-1); }) + +#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), \ + (p), (__mmask32)(m)); }) + +#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), \ + (p), (__mmask16)-1); }) + +#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), \ + (p), (__mmask16)(m)); }) + +#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), \ + (p), (__mmask16)-1); }) + +#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), \ + (p), (__mmask16)(m)); }) + #endif /* __AVX512VLBWINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h new file mode 100644 index 0000000..4024446 --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vldqintrin.h @@ -0,0 +1,349 @@ +/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ---------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLDQINTRIN_H +#define __AVX512VLDQINTRIN_H + + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mullo_epi64 (__m256i __A, __m256i __B) { + return (__m256i) ((__v4di) __A * (__v4di) __B); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mullo_epi64 (__m128i __A, __m128i __B) { + return (__m128i) ((__v2di) __A * (__v2di) __B); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +#endif diff --git a/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h b/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h index 8a374b1..9de0cf4 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avx512vlintrin.h @@ -42,6 +42,17 @@ _mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { __u); } +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, + __u); +} static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) _mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) { @@ -56,6 +67,18 @@ _mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { } static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) { return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b, (__mmask8)-1); @@ -67,6 +90,17 @@ _mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { __u); } +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, + __u); +} static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) _mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) { @@ -80,4 +114,1206 @@ _mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { __u); } +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, + __u); +} + + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, + __u); +} + + + + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epi32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epi64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmple_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { + return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, + __u); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, + (__mmask8)-1); +} + +static __inline__ __mmask8 __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { + return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, + __u); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} + +static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_mullo_epi32 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __attribute__ ((__always_inline__, __nodebug__)) +_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __attribute__ ((__always_inline__, __nodebug__)) +_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_ps_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm256_cmp_pd_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256)(a), \ + (__v4df)(__m256)(b), \ + (p), (__mmask8)-1); }) + +#define _mm256_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256)(a), \ + (__v4df)(__m256)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm128_cmp_ps_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), \ + (p), (__mmask8)-1); }) + +#define _mm128_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), \ + (p), (__mmask8)(m)); }) + +#define _mm128_cmp_pd_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128)(a), \ + (__v2df)(__m128)(b), \ + (p), (__mmask8)-1); }) + +#define _mm128_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128)(a), \ + (__v2df)(__m128)(b), \ + (p), (__mmask8)(m)); }) #endif /* __AVX512VLINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h index 4e1044a..4907965 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h @@ -257,8 +257,7 @@ _mm_permutevar_ps(__m128 __a, __m128i __c) static __inline __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_permutevar_ps(__m256 __a, __m256i __c) { - return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, - (__v8si)__c); + return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); } #define _mm_permute_pd(A, C) __extension__ ({ \ @@ -430,35 +429,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) __m128 __b = (b); \ (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) -/* Vector extract */ -#define _mm256_extractf128_pd(A, O) __extension__ ({ \ - __m256d __A = (A); \ - (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) - -#define _mm256_extractf128_ps(A, O) __extension__ ({ \ - __m256 __A = (A); \ - (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) - -#define _mm256_extractf128_si256(A, O) __extension__ ({ \ - __m256i __A = (A); \ - (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) - static __inline int __attribute__((__always_inline__, __nodebug__)) -_mm256_extract_epi32(__m256i __a, int const __imm) +_mm256_extract_epi32(__m256i __a, const int __imm) { __v8si __b = (__v8si)__a; return __b[__imm & 7]; } static __inline int __attribute__((__always_inline__, __nodebug__)) -_mm256_extract_epi16(__m256i __a, int const __imm) +_mm256_extract_epi16(__m256i __a, const int __imm) { __v16hi __b = (__v16hi)__a; return __b[__imm & 15]; } static __inline int __attribute__((__always_inline__, __nodebug__)) -_mm256_extract_epi8(__m256i __a, int const __imm) +_mm256_extract_epi8(__m256i __a, const int __imm) { __v32qi __b = (__v32qi)__a; return __b[__imm & 31]; @@ -473,22 +459,6 @@ _mm256_extract_epi64(__m256i __a, const int __imm) } #endif -/* Vector insert */ -#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ - __m256d __V1 = (V1); \ - __m128d __V2 = (V2); \ - (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) - -#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ - __m256 __V1 = (V1); \ - __m128 __V2 = (V2); \ - (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) - -#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ - __m256i __V1 = (V1); \ - __m128i __V2 = (V2); \ - (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) - static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_insert_epi32(__m256i __a, int __b, int const __imm) { @@ -515,7 +485,7 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm) #ifdef __x86_64__ static __inline __m256i __attribute__((__always_inline__, __nodebug__)) -_mm256_insert_epi64(__m256i __a, int __b, int const __imm) +_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) { __v4di __c = (__v4di)__a; __c[__imm & 3] = __b; @@ -785,7 +755,7 @@ _mm256_loadu_pd(double const *__p) { struct __loadu_pd { __m256d __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -794,7 +764,7 @@ _mm256_loadu_ps(float const *__p) { struct __loadu_ps { __m256 __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -809,7 +779,7 @@ _mm256_loadu_si256(__m256i const *__p) { struct __loadu_si256 { __m256i __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si256*)__p)->__v; } @@ -935,23 +905,23 @@ _mm256_set_pd(double __a, double __b, double __c, double __d) static __inline __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_set_ps(float __a, float __b, float __c, float __d, - float __e, float __f, float __g, float __h) + float __e, float __f, float __g, float __h) { return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; } static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, - int __i4, int __i5, int __i6, int __i7) + int __i4, int __i5, int __i6, int __i7) { return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; } static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, - short __w11, short __w10, short __w09, short __w08, - short __w07, short __w06, short __w05, short __w04, - short __w03, short __w02, short __w01, short __w00) + short __w11, short __w10, short __w09, short __w08, + short __w07, short __w06, short __w05, short __w04, + short __w03, short __w02, short __w01, short __w00) { return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; @@ -959,13 +929,13 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, - char __b27, char __b26, char __b25, char __b24, - char __b23, char __b22, char __b21, char __b20, - char __b19, char __b18, char __b17, char __b16, - char __b15, char __b14, char __b13, char __b12, - char __b11, char __b10, char __b09, char __b08, - char __b07, char __b06, char __b05, char __b04, - char __b03, char __b02, char __b01, char __b00) + char __b27, char __b26, char __b25, char __b24, + char __b23, char __b22, char __b21, char __b20, + char __b19, char __b18, char __b17, char __b16, + char __b15, char __b14, char __b13, char __b12, + char __b11, char __b10, char __b09, char __b08, + char __b07, char __b06, char __b05, char __b04, + char __b03, char __b02, char __b01, char __b00) { return (__m256i)(__v32qi){ __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, @@ -990,23 +960,23 @@ _mm256_setr_pd(double __a, double __b, double __c, double __d) static __inline __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_setr_ps(float __a, float __b, float __c, float __d, - float __e, float __f, float __g, float __h) + float __e, float __f, float __g, float __h) { return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; } static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, - int __i4, int __i5, int __i6, int __i7) + int __i4, int __i5, int __i6, int __i7) { return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; } static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, - short __w11, short __w10, short __w09, short __w08, - short __w07, short __w06, short __w05, short __w04, - short __w03, short __w02, short __w01, short __w00) + short __w11, short __w10, short __w09, short __w08, + short __w07, short __w06, short __w05, short __w04, + short __w03, short __w02, short __w01, short __w00) { return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; @@ -1014,19 +984,19 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, - char __b27, char __b26, char __b25, char __b24, - char __b23, char __b22, char __b21, char __b20, - char __b19, char __b18, char __b17, char __b16, - char __b15, char __b14, char __b13, char __b12, - char __b11, char __b10, char __b09, char __b08, - char __b07, char __b06, char __b05, char __b04, - char __b03, char __b02, char __b01, char __b00) + char __b27, char __b26, char __b25, char __b24, + char __b23, char __b22, char __b21, char __b20, + char __b19, char __b18, char __b17, char __b16, + char __b15, char __b14, char __b13, char __b12, + char __b11, char __b10, char __b09, char __b08, + char __b07, char __b06, char __b05, char __b04, + char __b03, char __b02, char __b01, char __b00) { return (__m256i)(__v32qi){ __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, - __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, - __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, - __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; + __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, + __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, + __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; } static __inline __m256i __attribute__((__always_inline__, __nodebug__)) @@ -1167,6 +1137,70 @@ _mm256_castsi128_si256(__m128i __a) return __builtin_shufflevector(__a, __a, 0, 1, -1, -1); } +/* + Vector insert. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ + (__m256)__builtin_shufflevector( \ + (__v8sf)(V1), \ + (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ + (((M) & 1) ? 0 : 8), \ + (((M) & 1) ? 1 : 9), \ + (((M) & 1) ? 2 : 10), \ + (((M) & 1) ? 3 : 11), \ + (((M) & 1) ? 8 : 4), \ + (((M) & 1) ? 9 : 5), \ + (((M) & 1) ? 10 : 6), \ + (((M) & 1) ? 11 : 7) );}) + +#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ + (__m256d)__builtin_shufflevector( \ + (__v4df)(V1), \ + (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) + +#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ + (__m256i)__builtin_shufflevector( \ + (__v4di)(V1), \ + (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) + +/* + Vector extract. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +#define _mm256_extractf128_ps(V, M) __extension__ ({ \ + (__m128)__builtin_shufflevector( \ + (__v8sf)(V), \ + (__v8sf)(_mm256_setzero_ps()), \ + (((M) & 1) ? 4 : 0), \ + (((M) & 1) ? 5 : 1), \ + (((M) & 1) ? 6 : 2), \ + (((M) & 1) ? 7 : 3) );}) + +#define _mm256_extractf128_pd(V, M) __extension__ ({ \ + (__m128d)__builtin_shufflevector( \ + (__v4df)(V), \ + (__v4df)(_mm256_setzero_pd()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + +#define _mm256_extractf128_si256(V, M) __extension__ ({ \ + (__m128i)__builtin_shufflevector( \ + (__v4di)(V), \ + (__v4di)(_mm256_setzero_si256()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + /* SIMD load ops (unaligned) */ static __inline __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) @@ -1195,7 +1229,7 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) { struct __loadu_si128 { __m128i __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); __m256i __v256 = _mm256_castsi128_si256( ((struct __loadu_si128*)__addr_lo)->__v); return _mm256_insertf128_si256(__v256, @@ -1236,4 +1270,34 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) __builtin_ia32_storedqu((char *)__addr_hi, (__v16qi)__v128); } +static __inline __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128 (__m128 __hi, __m128 __lo) { + return (__m256) __builtin_shufflevector(__lo, __hi, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128d (__m128d __hi, __m128d __lo) { + return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); +} + +static __inline __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_set_m128i (__m128i __hi, __m128i __lo) { + return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); +} + +static __inline __m256 __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128 (__m128 __lo, __m128 __hi) { + return _mm256_set_m128(__hi, __lo); +} + +static __inline __m256d __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128d (__m128d __lo, __m128d __hi) { + return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); +} + +static __inline __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_setr_m128i (__m128i __lo, __m128i __hi) { + return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); +} + #endif /* __AVXINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/cuda_builtin_vars.h b/contrib/llvm/tools/clang/lib/Headers/cuda_builtin_vars.h new file mode 100644 index 0000000..901356b --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/cuda_builtin_vars.h @@ -0,0 +1,110 @@ +/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CUDA_BUILTIN_VARS_H +#define __CUDA_BUILTIN_VARS_H + +// The file implements built-in CUDA variables using __declspec(property). +// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx +// All read accesses of built-in variable fields get converted into calls to a +// getter function which in turn would call appropriate builtin to fetch the +// value. +// +// Example: +// int x = threadIdx.x; +// IR output: +// %0 = call i32 @llvm.ptx.read.tid.x() #3 +// PTX output: +// mov.u32 %r2, %tid.x; + +#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC) \ + __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD; \ + static inline __attribute__((always_inline)) \ + __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) { \ + return INTRINSIC; \ + } + +#if __cplusplus >= 201103L +#define __DELETE =delete +#else +#define __DELETE +#endif + +// Make sure nobody can create instances of the special varible types. nvcc +// also disallows taking address of special variables, so we disable address-of +// operator as well. +#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \ + __attribute__((device)) TypeName() __DELETE; \ + __attribute__((device)) TypeName(const TypeName &) __DELETE; \ + __attribute__((device)) void operator=(const TypeName &) const __DELETE; \ + __attribute__((device)) TypeName *operator&() const __DELETE + +struct __cuda_builtin_threadIdx_t { + __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x()); + __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y()); + __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z()); +private: + __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t); +}; + +struct __cuda_builtin_blockIdx_t { + __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x()); + __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y()); + __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z()); +private: + __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t); +}; + +struct __cuda_builtin_blockDim_t { + __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x()); + __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y()); + __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z()); +private: + __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t); +}; + +struct __cuda_builtin_gridDim_t { + __CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x()); + __CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y()); + __CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z()); +private: + __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t); +}; + +#define __CUDA_BUILTIN_VAR \ + extern const __attribute__((device)) __attribute__((weak)) +__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx; +__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx; +__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim; +__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim; + +// warpSize should translate to read of %WARP_SZ but there's currently no +// builtin to do so. According to PTX v4.2 docs 'to date, all target +// architectures have a WARP_SZ value of 32'. +__attribute__((device)) const int warpSize = 32; + +#undef __CUDA_DEVICE_BUILTIN +#undef __CUDA_BUILTIN_VAR +#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS + +#endif /* __CUDA_BUILTIN_VARS_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/emmintrin.h b/contrib/llvm/tools/clang/lib/Headers/emmintrin.h index 28d0043..c764d68 100644 --- a/contrib/llvm/tools/clang/lib/Headers/emmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/emmintrin.h @@ -489,7 +489,7 @@ _mm_loadu_pd(double const *__dp) { struct __loadu_pd { __m128d __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__dp)->__v; } @@ -825,11 +825,28 @@ _mm_xor_si128(__m128i __a, __m128i __b) return __a ^ __b; } -#define _mm_slli_si128(a, count) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128i __a = (a); \ - _Pragma("clang diagnostic pop"); \ - (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); }) +#define _mm_slli_si128(a, imm) __extension__ ({ \ + (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ + (__v16qi)(__m128i)(a), \ + ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ + ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) + +#define _mm_bslli_si128(a, imm) \ + _mm_slli_si128((a), (imm)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) _mm_slli_epi16(__m128i __a, int __count) @@ -891,12 +908,28 @@ _mm_sra_epi32(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); } - -#define _mm_srli_si128(a, count) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128i __a = (a); \ - _Pragma("clang diagnostic pop"); \ - (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); }) +#define _mm_srli_si128(a, imm) __extension__ ({ \ + (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ + (__v16qi)_mm_setzero_si128(), \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ + ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) + +#define _mm_bsrli_si128(a, imm) \ + _mm_srli_si128((a), (imm)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) _mm_srli_epi16(__m128i __a, int __count) @@ -1070,7 +1103,7 @@ _mm_loadu_si128(__m128i const *__p) { struct __loadu_si128 { __m128i __v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si128*)__p)->__v; } @@ -1284,27 +1317,21 @@ _mm_movemask_epi8(__m128i __a) } #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128i __a = (a); \ - _Pragma("clang diagnostic pop"); \ - (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ + (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ + (__v4si)_mm_set1_epi32(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128i __a = (a); \ - _Pragma("clang diagnostic pop"); \ - (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ + (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ + (__v8hi)_mm_set1_epi16(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 4, 5, 6, 7); }) #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128i __a = (a); \ - _Pragma("clang diagnostic pop"); \ - (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ + (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ + (__v8hi)_mm_set1_epi16(0), \ 0, 1, 2, 3, \ 4 + (((imm) & 0x03) >> 0), \ 4 + (((imm) & 0x0c) >> 2), \ @@ -1396,11 +1423,8 @@ _mm_movemask_pd(__m128d __a) } #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ - _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wshadow\""); \ - __m128d __a = (a); \ - __m128d __b = (b); \ - _Pragma("clang diagnostic pop"); \ - __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) + __builtin_shufflevector((__m128d)(a), (__m128d)(b), \ + (i) & 1, (((i) & 2) >> 1) + 2); }) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_castpd_ps(__m128d __a) diff --git a/contrib/llvm/tools/clang/lib/Headers/htmintrin.h b/contrib/llvm/tools/clang/lib/Headers/htmintrin.h new file mode 100644 index 0000000..0088c7c --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/htmintrin.h @@ -0,0 +1,226 @@ +/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * +\*===----------------------------------------------------------------------===*/ + +#ifndef __HTMINTRIN_H +#define __HTMINTRIN_H + +#ifndef __HTM__ +#error "HTM instruction set not enabled" +#endif + +#ifdef __powerpc__ + +#include <stdint.h> + +typedef uint64_t texasr_t; +typedef uint32_t texasru_t; +typedef uint32_t texasrl_t; +typedef uintptr_t tfiar_t; +typedef uintptr_t tfhar_t; + +#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3) +#define _HTM_NONTRANSACTIONAL 0x0 +#define _HTM_SUSPENDED 0x1 +#define _HTM_TRANSACTIONAL 0x2 + +#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \ + (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1)) +#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \ + (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1)) + +#define _TEXASR_FAILURE_CODE(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 7, 8) +#define _TEXASRU_FAILURE_CODE(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8) + +#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 7, 1) +#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1) + +#define _TEXASR_DISALLOWED(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 8, 1) +#define _TEXASRU_DISALLOWED(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1) + +#define _TEXASR_NESTING_OVERFLOW(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 9, 1) +#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1) + +#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 10, 1) +#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1) + +#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 11, 1) +#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1) + +#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 12, 1) +#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1) + +#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 13, 1) +#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1) + +#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 14, 1) +#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1) + +#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 15, 1) +#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1) + +#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 16, 1) +#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1) + +#define _TEXASR_ABORT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 31, 1) +#define _TEXASRU_ABORT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1) + + +#define _TEXASR_SUSPENDED(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 32, 1) + +#define _TEXASR_PRIVILEGE(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 35, 2) + +#define _TEXASR_FAILURE_SUMMARY(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 36, 1) + +#define _TEXASR_TFIAR_EXACT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 37, 1) + +#define _TEXASR_ROT(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 38, 1) + +#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \ + _TEXASR_EXTRACT_BITS(TEXASR, 63, 12) + +#endif /* __powerpc */ + +#ifdef __s390__ + +/* Condition codes generated by tbegin */ +#define _HTM_TBEGIN_STARTED 0 +#define _HTM_TBEGIN_INDETERMINATE 1 +#define _HTM_TBEGIN_TRANSIENT 2 +#define _HTM_TBEGIN_PERSISTENT 3 + +/* The abort codes below this threshold are reserved for machine use. */ +#define _HTM_FIRST_USER_ABORT_CODE 256 + +/* The transaction diagnostic block is it is defined in the Principles + of Operation chapter 5-91. */ + +struct __htm_tdb { + unsigned char format; /* 0 */ + unsigned char flags; + unsigned char reserved1[4]; + unsigned short nesting_depth; + unsigned long long abort_code; /* 8 */ + unsigned long long conflict_token; /* 16 */ + unsigned long long atia; /* 24 */ + unsigned char eaid; /* 32 */ + unsigned char dxc; + unsigned char reserved2[2]; + unsigned int program_int_id; + unsigned long long exception_id; /* 40 */ + unsigned long long bea; /* 48 */ + unsigned char reserved3[72]; /* 56 */ + unsigned long long gprs[16]; /* 128 */ +} __attribute__((__packed__, __aligned__ (8))); + + +/* Helper intrinsics to retry tbegin in case of transient failure. */ + +static __inline int __attribute__((__always_inline__, __nodebug__)) +__builtin_tbegin_retry_null (int retry) +{ + int cc, i = 0; + + while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT + && i++ < retry) + __builtin_tx_assist(i); + + return cc; +} + +static __inline int __attribute__((__always_inline__, __nodebug__)) +__builtin_tbegin_retry_tdb (void *tdb, int retry) +{ + int cc, i = 0; + + while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT + && i++ < retry) + __builtin_tx_assist(i); + + return cc; +} + +#define __builtin_tbegin_retry(tdb, retry) \ + (__builtin_constant_p(tdb == 0) && tdb == 0 ? \ + __builtin_tbegin_retry_null(retry) : \ + __builtin_tbegin_retry_tdb(tdb, retry)) + +static __inline int __attribute__((__always_inline__, __nodebug__)) +__builtin_tbegin_retry_nofloat_null (int retry) +{ + int cc, i = 0; + + while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT + && i++ < retry) + __builtin_tx_assist(i); + + return cc; +} + +static __inline int __attribute__((__always_inline__, __nodebug__)) +__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry) +{ + int cc, i = 0; + + while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT + && i++ < retry) + __builtin_tx_assist(i); + + return cc; +} + +#define __builtin_tbegin_retry_nofloat(tdb, retry) \ + (__builtin_constant_p(tdb == 0) && tdb == 0 ? \ + __builtin_tbegin_retry_nofloat_null(retry) : \ + __builtin_tbegin_retry_nofloat_tdb(tdb, retry)) + +#endif /* __s390__ */ + +#endif /* __HTMINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h b/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h new file mode 100644 index 0000000..30f524d --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/htmxlintrin.h @@ -0,0 +1,363 @@ +/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * +\*===----------------------------------------------------------------------===*/ + +#ifndef __HTMXLINTRIN_H +#define __HTMXLINTRIN_H + +#ifndef __HTM__ +#error "HTM instruction set not enabled" +#endif + +#include <htmintrin.h> + +#ifdef __powerpc__ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _TEXASR_PTR(TM_BUF) \ + ((texasr_t *)((TM_BUF)+0)) +#define _TEXASRU_PTR(TM_BUF) \ + ((texasru_t *)((TM_BUF)+0)) +#define _TEXASRL_PTR(TM_BUF) \ + ((texasrl_t *)((TM_BUF)+4)) +#define _TFIAR_PTR(TM_BUF) \ + ((tfiar_t *)((TM_BUF)+8)) + +typedef char TM_buff_type[16]; + +/* This macro can be used to determine whether a transaction was successfully + started from the __TM_begin() and __TM_simple_begin() intrinsic functions + below. */ +#define _HTM_TBEGIN_STARTED 1 + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_simple_begin (void) +{ + if (__builtin_expect (__builtin_tbegin (0), 1)) + return _HTM_TBEGIN_STARTED; + return 0; +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_begin (void* const TM_buff) +{ + *_TEXASRL_PTR (TM_buff) = 0; + if (__builtin_expect (__builtin_tbegin (0), 1)) + return _HTM_TBEGIN_STARTED; +#ifdef __powerpc64__ + *_TEXASR_PTR (TM_buff) = __builtin_get_texasr (); +#else + *_TEXASRU_PTR (TM_buff) = __builtin_get_texasru (); + *_TEXASRL_PTR (TM_buff) = __builtin_get_texasr (); +#endif + *_TFIAR_PTR (TM_buff) = __builtin_get_tfiar (); + return 0; +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_end (void) +{ + if (__builtin_expect (__builtin_tend (0), 1)) + return 1; + return 0; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_abort (void) +{ + __builtin_tabort (0); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_named_abort (unsigned char const code) +{ + __builtin_tabort (code); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_resume (void) +{ + __builtin_tresume (); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_suspend (void) +{ + __builtin_tsuspend (); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_user_abort (void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + return _TEXASRU_ABORT (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_named_user_abort (void* const TM_buff, unsigned char *code) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + + *code = _TEXASRU_FAILURE_CODE (texasru); + return _TEXASRU_ABORT (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_illegal (void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + return _TEXASRU_DISALLOWED (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_footprint_exceeded (void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + return _TEXASRU_FOOTPRINT_OVERFLOW (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_nesting_depth (void* const TM_buff) +{ + texasrl_t texasrl; + + if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL) + { + texasrl = *_TEXASRL_PTR (TM_buff); + if (!_TEXASR_FAILURE_SUMMARY (texasrl)) + texasrl = 0; + } + else + texasrl = (texasrl_t) __builtin_get_texasr (); + + return _TEXASR_TRANSACTION_LEVEL (texasrl); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_nested_too_deep(void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + return _TEXASRU_NESTING_OVERFLOW (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_conflict(void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + /* Return TEXASR bits 11 (Self-Induced Conflict) through + 14 (Translation Invalidation Conflict). */ + return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0; +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_is_failure_persistent(void* const TM_buff) +{ + texasru_t texasru = *_TEXASRU_PTR (TM_buff); + return _TEXASRU_FAILURE_PERSISTENT (texasru); +} + +extern __inline long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_failure_address(void* const TM_buff) +{ + return *_TFIAR_PTR (TM_buff); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__TM_failure_code(void* const TM_buff) +{ + return *_TEXASR_PTR (TM_buff); +} + +#ifdef __cplusplus +} +#endif + +#endif /* __powerpc__ */ + +#ifdef __s390__ + +#include <stdint.h> + +/* These intrinsics are being made available for compatibility with + the IBM XL compiler. For documentation please see the "z/OS XL + C/C++ Programming Guide" publically available on the web. */ + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_simple_begin () +{ + return __builtin_tbegin_nofloat (0); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_begin (void* const tdb) +{ + return __builtin_tbegin_nofloat (tdb); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_end () +{ + return __builtin_tend (); +} + +static __inline void __attribute__((__always_inline__)) +__TM_abort () +{ + return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE); +} + +static __inline void __attribute__((__always_inline__, __nodebug__)) +__TM_named_abort (unsigned char const code) +{ + return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code); +} + +static __inline void __attribute__((__always_inline__, __nodebug__)) +__TM_non_transactional_store (void* const addr, long long const value) +{ + __builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_nesting_depth (void* const tdb_ptr) +{ + int depth = __builtin_tx_nesting_depth (); + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + if (depth != 0) + return depth; + + if (tdb->format != 1) + return 0; + return tdb->nesting_depth; +} + +/* Transaction failure diagnostics */ + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_user_abort (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + if (tdb->format != 1) + return 0; + + return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + if (tdb->format != 1) + return 0; + + if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE) + { + *code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE; + return 1; + } + return 0; +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_illegal (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + return (tdb->format == 1 + && (tdb->abort_code == 4 /* unfiltered program interruption */ + || tdb->abort_code == 11 /* restricted instruction */)); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_footprint_exceeded (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + return (tdb->format == 1 + && (tdb->abort_code == 7 /* fetch overflow */ + || tdb->abort_code == 8 /* store overflow */)); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_nested_too_deep (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */ +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_conflict (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + return (tdb->format == 1 + && (tdb->abort_code == 9 /* fetch conflict */ + || tdb->abort_code == 10 /* store conflict */)); +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_is_failure_persistent (long const result) +{ + return result == _HTM_TBEGIN_PERSISTENT; +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_failure_address (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + return tdb->atia; +} + +static __inline long __attribute__((__always_inline__, __nodebug__)) +__TM_failure_code (void* const tdb_ptr) +{ + struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr; + + return tdb->abort_code; +} + +#endif /* __s390__ */ + +#endif /* __HTMXLINTRIN_H */ diff --git a/contrib/llvm/tools/clang/lib/Headers/immintrin.h b/contrib/llvm/tools/clang/lib/Headers/immintrin.h index 2400fea..ac7d54a 100644 --- a/contrib/llvm/tools/clang/lib/Headers/immintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/immintrin.h @@ -88,10 +88,18 @@ #include <avx512bwintrin.h> #endif +#ifdef __AVX512DQ__ +#include <avx512dqintrin.h> +#endif + #if defined (__AVX512VL__) && defined (__AVX512BW__) #include <avx512vlbwintrin.h> #endif +#if defined (__AVX512VL__) && defined (__AVX512DQ__) +#include <avx512vldqintrin.h> +#endif + #ifdef __AVX512ER__ #include <avx512erintrin.h> #endif diff --git a/contrib/llvm/tools/clang/lib/Headers/module.modulemap b/contrib/llvm/tools/clang/lib/Headers/module.modulemap index 062464e..ac5876f 100644 --- a/contrib/llvm/tools/clang/lib/Headers/module.modulemap +++ b/contrib/llvm/tools/clang/lib/Headers/module.modulemap @@ -49,7 +49,7 @@ module _Builtin_intrinsics [system] [extern_c] { explicit module sse { requires sse export mmx - export * // note: for hackish <emmintrin.h> dependency + export sse2 // note: for hackish <emmintrin.h> dependency header "xmmintrin.h" } @@ -169,6 +169,19 @@ module _Builtin_intrinsics [system] [extern_c] { header "__wmmintrin_pclmul.h" } } + + explicit module systemz { + requires systemz + export * + + header "s390intrin.h" + + explicit module htm { + requires htm + header "htmintrin.h" + header "htmxlintrin.h" + } + } } module _Builtin_stddef_max_align_t [system] [extern_c] { diff --git a/contrib/llvm/tools/clang/lib/Headers/s390intrin.h b/contrib/llvm/tools/clang/lib/Headers/s390intrin.h new file mode 100644 index 0000000..b209895 --- /dev/null +++ b/contrib/llvm/tools/clang/lib/Headers/s390intrin.h @@ -0,0 +1,35 @@ +/*===---- s390intrin.h - SystemZ intrinsics --------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __S390INTRIN_H +#define __S390INTRIN_H + +#ifndef __s390__ +#error "<s390intrin.h> is for s390 only" +#endif + +#ifdef __HTM__ +#include <htmintrin.h> +#endif + +#endif /* __S390INTRIN_H*/ diff --git a/contrib/llvm/tools/clang/lib/Headers/stdatomic.h b/contrib/llvm/tools/clang/lib/Headers/stdatomic.h index e3c3476..e037987 100644 --- a/contrib/llvm/tools/clang/lib/Headers/stdatomic.h +++ b/contrib/llvm/tools/clang/lib/Headers/stdatomic.h @@ -71,7 +71,7 @@ typedef enum memory_order { /* 7.17.4 Fences */ -// These should be provided by the libc implementation. +/* These should be provided by the libc implementation. */ void atomic_thread_fence(memory_order); void atomic_signal_fence(memory_order); @@ -164,7 +164,7 @@ typedef struct atomic_flag { atomic_bool _Value; } atomic_flag; #define ATOMIC_FLAG_INIT { 0 } -// These should be provided by the libc implementation. +/* These should be provided by the libc implementation. */ #ifdef __cplusplus bool atomic_flag_test_and_set(volatile atomic_flag *); bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order); diff --git a/contrib/llvm/tools/clang/lib/Headers/unwind.h b/contrib/llvm/tools/clang/lib/Headers/unwind.h index 90aca16..303d792 100644 --- a/contrib/llvm/tools/clang/lib/Headers/unwind.h +++ b/contrib/llvm/tools/clang/lib/Headers/unwind.h @@ -235,9 +235,9 @@ void *_Unwind_FindEnclosingFunction(void *); #ifdef __APPLE__ _Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *) - __attribute__((unavailable)); + __attribute__((__unavailable__)); _Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *) - __attribute__((unavailable)); + __attribute__((__unavailable__)); /* Darwin-specific functions */ void __register_frame(const void *); @@ -251,15 +251,15 @@ struct dwarf_eh_bases { void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *); void __register_frame_info_bases(const void *, void *, void *, void *) - __attribute__((unavailable)); -void __register_frame_info(const void *, void *) __attribute__((unavailable)); + __attribute__((__unavailable__)); +void __register_frame_info(const void *, void *) __attribute__((__unavailable__)); void __register_frame_info_table_bases(const void *, void*, void *, void *) - __attribute__((unavailable)); + __attribute__((__unavailable__)); void __register_frame_info_table(const void *, void *) - __attribute__((unavailable)); -void __register_frame_table(const void *) __attribute__((unavailable)); -void __deregister_frame_info(const void *) __attribute__((unavailable)); -void __deregister_frame_info_bases(const void *)__attribute__((unavailable)); + __attribute__((__unavailable__)); +void __register_frame_table(const void *) __attribute__((__unavailable__)); +void __deregister_frame_info(const void *) __attribute__((__unavailable__)); +void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__)); #else diff --git a/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h b/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h index d1afe81..3a6b95e 100644 --- a/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/xmmintrin.h @@ -994,7 +994,7 @@ do { \ #define _m_ _mm_ /* Ugly hack for backwards-compatibility (compatible with gcc) */ -#ifdef __SSE2__ +#if defined(__SSE2__) && !__has_feature(modules) #include <emmintrin.h> #endif |