Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers/avxintrin.h')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Headers/avxintrin.h | 186
1 file changed, 98 insertions, 88 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h
index c1bc85b..6d1ca54 100644
--- a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h
+++ b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h
@@ -35,12 +35,16 @@ typedef int __v8si __attribute__ ((__vector_size__ (32)));
 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
 
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
 typedef float __m256 __attribute__ ((__vector_size__ (32)));
 typedef double __m256d __attribute__((__vector_size__(32)));
 typedef long long __m256i __attribute__((__vector_size__(32)));
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
 
 /* Arithmetic */
 static __inline __m256d __DEFAULT_FN_ATTRS
@@ -152,12 +156,10 @@ _mm256_rcp_ps(__m256 __a)
 }
 
 #define _mm256_round_pd(V, M) __extension__ ({ \
-  __m256d __V = (V); \
-  (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
+  (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
 
 #define _mm256_round_ps(V, M) __extension__ ({ \
-  __m256 __V = (V); \
-  (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
 
 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
@@ -264,26 +266,26 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 }
 
 #define _mm_permute_pd(A, C) __extension__ ({ \
-  __m128d __A = (A); \
-  (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1); })
 
 #define _mm256_permute_pd(A, C) __extension__ ({ \
-  __m256d __A = (A); \
-  (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1, \
                                    2 + (((C) & 0x4) >> 2), \
                                    2 + (((C) & 0x8) >> 3)); })
 
 #define _mm_permute_ps(A, C) __extension__ ({ \
-  __m128 __A = (A); \
-  (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
 
 #define _mm256_permute_ps(A, C) __extension__ ({ \
-  __m256 __A = (A); \
-  (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_setzero_ps(), \
                                   (C) & 0x3, ((C) & 0xc) >> 2, \
                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
                                   4 + (((C) & 0x03) >> 0), \
@@ -292,34 +294,29 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
                                   4 + (((C) & 0xc0) >> 6)); })
 
 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
-  __m256d __V1 = (V1); \
-  __m256d __V2 = (V2); \
-  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
 
 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
 
 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
-  __m256i __V1 = (V1); \
-  __m256i __V2 = (V2); \
-  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
 
 /* Vector Blend */
 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
-  __m256d __V1 = (V1); \
-  __m256d __V2 = (V2); \
-  (__m256d)__builtin_shufflevector((__v4df)__V1, (__v4df)__V2, \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
                                    (((M) & 0x01) ? 4 : 0), \
                                    (((M) & 0x02) ? 5 : 1), \
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)); })
 
 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_shufflevector((__v8sf)__V1, (__v8sf)__V2, \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
                                   (((M) & 0x01) ? 8 : 0), \
                                   (((M) & 0x02) ? 9 : 1), \
                                   (((M) & 0x04) ? 10 : 2), \
@@ -345,28 +342,29 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /* Vector Dot Product */
 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
-  __m256 __V1 = (V1); \
-  __m256 __V2 = (V2); \
-  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
 
 /* Vector shuffle */
 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
-        __m256 __a = (a); \
-        __m256 __b = (b); \
-        (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
-        (mask) & 0x3, ((mask) & 0xc) >> 2, \
-        (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
-        ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
-        (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), \
+                                  (mask) & 0x3, \
+                                  ((mask) & 0xc) >> 2, \
+                                  (((mask) & 0x30) >> 4) + 8, \
+                                  (((mask) & 0xc0) >> 6) + 8, \
+                                  ((mask) & 0x3) + 4, \
+                                  (((mask) & 0xc) >> 2) + 4, \
+                                  (((mask) & 0x30) >> 4) + 12, \
+                                  (((mask) & 0xc0) >> 6) + 12); })
 
 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
-        __m256d __a = (a); \
-        __m256d __b = (b); \
-        (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
-        (mask) & 0x1, \
-        (((mask) & 0x2) >> 1) + 4, \
-        (((mask) & 0x4) >> 2) + 2, \
-        (((mask) & 0x8) >> 3) + 6); })
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), \
+                                   (mask) & 0x1, \
+                                   (((mask) & 0x2) >> 1) + 4, \
+                                   (((mask) & 0x4) >> 2) + 2, \
+                                   (((mask) & 0x8) >> 3) + 6); })
 
 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@@ -403,34 +401,28 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
-  __m128d __a = (a); \
-  __m128d __b = (b); \
-  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
 
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
-  __m128 __a = (a); \
-  __m128 __b = (b); \
-  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
 
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
-  __m256d __a = (a); \
-  __m256d __b = (b); \
-  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
 
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
-  __m256 __a = (a); \
-  __m256 __b = (b); \
-  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
 
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
-  __m128d __a = (a); \
-  __m128d __b = (b); \
-  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
 
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
-  __m128 __a = (a); \
-  __m128 __b = (b); \
-  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
 
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi32(__m256i __a, const int __imm)
@@ -831,53 +823,53 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a)
 
 /* Conditional load ops */
 static __inline __m128d __DEFAULT_FN_ATTRS
-_mm_maskload_pd(double const *__p, __m128d __m)
+_mm_maskload_pd(double const *__p, __m128i __m)
 {
-  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2df)__m);
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
 }
 
 static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_maskload_pd(double const *__p, __m256d __m)
+_mm256_maskload_pd(double const *__p, __m256i __m)
 {
   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
-                                               (__v4df)__m);
+                                               (__v4di)__m);
 }
 
 static __inline __m128 __DEFAULT_FN_ATTRS
-_mm_maskload_ps(float const *__p, __m128 __m)
+_mm_maskload_ps(float const *__p, __m128i __m)
 {
-  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4sf)__m);
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
 }
 
 static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_maskload_ps(float const *__p, __m256 __m)
+_mm256_maskload_ps(float const *__p, __m256i __m)
 {
-  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8sf)__m);
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
 }
 
 /* Conditional store ops */
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_ps(float *__p, __m256 __m, __m256 __a)
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
 {
-  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8sf)__m, (__v8sf)__a);
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
-_mm_maskstore_pd(double *__p, __m128d __m, __m128d __a)
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
 {
-  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2df)__m, (__v2df)__a);
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_pd(double *__p, __m256d __m, __m256d __a)
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
 {
-  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4df)__m, (__v4df)__a);
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
-_mm_maskstore_ps(float *__p, __m128 __m, __m128 __a)
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 {
-  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4sf)__m, (__v4sf)__a);
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
 }
 
 /* Cacheability support ops */
@@ -900,6 +892,24 @@ _mm256_stream_ps(float *__p, __m256 __a)
 }
 
 /* Create vectors */
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd()
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps()
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256()
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set_pd(double __a, double __b, double __c, double __d)
 {
@@ -1140,14 +1150,14 @@ _mm256_castsi128_si256(__m128i __a)
   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
-/* 
+/*
    Vector insert.
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_shufflevector( \
-    (__v8sf)(V1), \
+    (__v8sf)(__m256)(V1), \
     (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
     (((M) & 1) ? 0 : 8), \
     (((M) & 1) ? 1 : 9), \
@@ -1160,7 +1170,7 @@ _mm256_castsi128_si256(__m128i __a)
 
 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector( \
-    (__v4df)(V1), \
+    (__v4df)(__m256d)(V1), \
     (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
     (((M) & 1) ? 0 : 4), \
     (((M) & 1) ? 1 : 5), \
@@ -1169,21 +1179,21 @@ _mm256_castsi128_si256(__m128i __a)
 
 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_shufflevector( \
-    (__v4di)(V1), \
+    (__v4di)(__m256i)(V1), \
     (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
     (((M) & 1) ? 0 : 4), \
     (((M) & 1) ? 1 : 5), \
     (((M) & 1) ? 4 : 2), \
     (((M) & 1) ? 5 : 3) );})
 
-/* 
+/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
 */
 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   (__m128)__builtin_shufflevector( \
-    (__v8sf)(V), \
+    (__v8sf)(__m256)(V), \
     (__v8sf)(_mm256_setzero_ps()), \
     (((M) & 1) ? 4 : 0), \
     (((M) & 1) ? 5 : 1), \
@@ -1192,14 +1202,14 @@ _mm256_castsi128_si256(__m128i __a)
 
 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector( \
-    (__v4df)(V), \
+    (__v4df)(__m256d)(V), \
     (__v4df)(_mm256_setzero_pd()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector( \
-    (__v4di)(V), \
+    (__v4di)(__m256i)(V), \
     (__v4di)(_mm256_setzero_si256()), \
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
@@ -1222,7 +1232,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
 {
   struct __loadu_pd {
     __m128d __v;
   } __attribute__((__packed__, __may_alias__));
-  
+
   __m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
   return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
 }