diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Headers/avxintrin.h')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Headers/avxintrin.h | 546 |
1 files changed, 407 insertions, 139 deletions
diff --git a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h index be03ba3..dff5897 100644 --- a/contrib/llvm/tools/clang/lib/Headers/avxintrin.h +++ b/contrib/llvm/tools/clang/lib/Headers/avxintrin.h @@ -1458,12 +1458,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Computes two dot products in parallel, using the lower and upper /// halves of two [8 x float] vectors as input to the two computations, and /// returning the two dot products in the lower and upper halves of the -/// [8 x float] result. The immediate integer operand controls which input -/// elements will contribute to the dot product, and where the final results -/// are returned. In general, for each dot product, the four corresponding -/// elements of the input vectors are multiplied; the first two and second -/// two products are summed, then the two sums are added to form the final -/// result. +/// [8 x float] result. +/// +/// The immediate integer operand controls which input elements will +/// contribute to the dot product, and where the final results are returned. +/// In general, for each dot product, the four corresponding elements of the +/// input vectors are multiplied; the first two and second two products are +/// summed, then the two sums are added to form the final result. /// /// \headerfile <x86intrin.h> /// @@ -1497,15 +1498,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /* Vector shuffle */ /// \brief Selects 8 float values from the 256-bit operands of [8 x float], as -/// specified by the immediate value operand. The four selected elements in -/// each operand are copied to the destination according to the bits -/// specified in the immediate operand. 
The selected elements from the first -/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the -/// destination, and the selected elements from the second 256-bit operand -/// are copied to bits [127:64] and bits [255:192] of the destination. For -/// example, if bits [7:0] of the immediate operand contain a value of 0xFF, -/// the 256-bit destination vector would contain the following values: b[7], -/// b[7], a[7], a[7], b[3], b[3], a[3], a[3]. +/// specified by the immediate value operand. +/// +/// The four selected elements in each operand are copied to the destination +/// according to the bits specified in the immediate operand. The selected +/// elements from the first 256-bit operand are copied to bits [63:0] and +/// bits [191:128] of the destination, and the selected elements from the +/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of +/// the destination. For example, if bits [7:0] of the immediate operand +/// contain a value of 0xFF, the 256-bit destination vector would contain the +/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. /// /// \headerfile <x86intrin.h> /// @@ -1557,13 +1559,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 12 + (((mask) >> 6) & 0x3)); }) /// \brief Selects four double-precision values from the 256-bit operands of -/// [4 x double], as specified by the immediate value operand. The selected -/// elements from the first 256-bit operand are copied to bits [63:0] and -/// bits [191:128] in the destination, and the selected elements from the -/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in -/// the destination. For example, if bits [3:0] of the immediate operand -/// contain a value of 0xF, the 256-bit destination vector would contain the -/// following values: b[3], a[3], b[1], a[1]. +/// [4 x double], as specified by the immediate value operand. 
+/// +/// The selected elements from the first 256-bit operand are copied to bits +/// [63:0] and bits [191:128] in the destination, and the selected elements +/// from the second 256-bit operand are copied to bits [127:64] and bits +/// [255:192] in the destination. For example, if bits [3:0] of the immediate +/// operand contain a value of 0xF, the 256-bit destination vector would +/// contain the following values: b[3], a[3], b[1], a[1]. /// /// \headerfile <x86intrin.h> /// @@ -1613,9 +1616,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ -#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */ #define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ -#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */ #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ @@ -1628,10 +1631,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) #define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ #define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ #define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ -#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */ #define _CMP_ORD_S 0x17 /* Ordered (signaling) */ #define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ -#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */ 
#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ #define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ @@ -1641,9 +1644,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding double-precision values of two /// 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. Returns a [2 x double] vector consisting of -/// two doubles corresponding to the two comparison results: zero if the -/// comparison is false, and all 1's if the comparison is true. +/// immediate integer operand. +/// +/// Returns a [2 x double] vector consisting of two doubles corresponding to +/// the two comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. /// /// \headerfile <x86intrin.h> /// @@ -1660,17 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : 
Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_pd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ @@ -1678,9 +1704,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding values of two 128-bit vectors of /// [4 x float], using the operation specified by the immediate integer -/// operand. Returns a [4 x float] vector consisting of four floats -/// corresponding to the four comparison results: zero if the comparison is -/// false, and all 1's if the comparison is true. +/// operand. +/// +/// Returns a [4 x float] vector consisting of four floats corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. 
/// /// \headerfile <x86intrin.h> /// @@ -1697,17 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : 
False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ps(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ @@ -1715,9 +1764,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding double-precision values of two /// 256-bit vectors of [4 x double], using the operation specified by the -/// immediate integer operand. Returns a [4 x double] vector consisting of -/// four doubles corresponding to the four comparison results: zero if the -/// comparison is false, and all 1's if the comparison is true. +/// immediate integer operand. +/// +/// Returns a [4 x double] vector consisting of four doubles corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. 
/// /// \headerfile <x86intrin.h> /// @@ -1734,17 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : 
False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 256-bit vector of [4 x double] containing the comparison results. #define _mm256_cmp_pd(a, b, c) __extension__ ({ \ (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ @@ -1752,9 +1824,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding values of two 256-bit vectors of /// [8 x float], using the operation specified by the immediate integer -/// operand. Returns a [8 x float] vector consisting of eight floats -/// corresponding to the eight comparison results: zero if the comparison is -/// false, and all 1's if the comparison is true. +/// operand. +/// +/// Returns a [8 x float] vector consisting of eight floats corresponding to +/// the eight comparison results: zero if the comparison is false, and all +/// 1's if the comparison is true. 
/// /// \headerfile <x86intrin.h> /// @@ -1771,17 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : 
False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 256-bit vector of [8 x float] containing the comparison results. #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ @@ -1789,8 +1884,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding scalar double-precision values of /// two 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. If the result is true, all 64 bits of the -/// destination vector are set; otherwise they are cleared. +/// immediate integer operand. +/// +/// If the result is true, all 64 bits of the destination vector are set; +/// otherwise they are cleared. /// /// \headerfile <x86intrin.h> /// @@ -1807,17 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal 
(unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_sd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ @@ -1825,8 +1943,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \brief Compares each of the corresponding scalar values of two 128-bit /// vectors of [4 x float], using the operation specified by the immediate -/// integer operand. If the result is true, all 32 bits of the destination -/// vector are set; otherwise they are cleared. +/// integer operand. +/// +/// If the result is true, all 32 bits of the destination vector are set; +/// otherwise they are cleared. 
/// /// \headerfile <x86intrin.h> /// @@ -1843,17 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 00h, 08h, 10h, 18h: Equal \n -/// 01h, 09h, 11h, 19h: Less than \n -/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal -/// (swapped operands) \n -/// 03h, 0Bh, 13h, 1Bh: Unordered \n -/// 04h, 0Ch, 14h, 1Ch: Not equal \n -/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than -/// (swapped operands) \n -/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal -/// (swapped operands) \n -/// 07h, 0Fh, 17h, 1Fh: Ordered +/// 0x00 : Equal (ordered, non-signaling) +/// 0x01 : Less-than (ordered, signaling) +/// 0x02 : Less-than-or-equal (ordered, signaling) +/// 0x03 : Unordered (non-signaling) +/// 0x04 : Not-equal (unordered, non-signaling) +/// 0x05 : Not-less-than (unordered, signaling) +/// 0x06 : Not-less-than-or-equal (unordered, signaling) +/// 0x07 : Ordered (non-signaling) +/// 0x08 : Equal (unordered, non-signaling) +/// 0x09 : Not-greater-than-or-equal (unordered, signaling) +/// 0x0a : Not-greater-than (unordered, signaling) +/// 0x0b : False (ordered, non-signaling) +/// 0x0c : Not-equal (ordered, non-signaling) +/// 0x0d : Greater-than-or-equal (ordered, signaling) +/// 0x0e : Greater-than (ordered, signaling) +/// 0x0f : True (unordered, non-signaling) +/// 0x10 : Equal (ordered, signaling) +/// 0x11 : Less-than (ordered, non-signaling) +/// 0x12 : Less-than-or-equal (ordered, non-signaling) +/// 0x13 : Unordered (signaling) +/// 0x14 : Not-equal (unordered, signaling) +/// 0x15 : Not-less-than (unordered, non-signaling) +/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) +/// 0x17 : Ordered (signaling) +/// 0x18 : Equal (unordered, signaling) +/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) +/// 0x1a : Not-greater-than (unordered, non-signaling) +/// 0x1b : 
False (ordered, signaling) +/// 0x1c : Not-equal (ordered, signaling) +/// 0x1d : Greater-than-or-equal (ordered, non-signaling) +/// 0x1e : Greater-than (ordered, non-signaling) +/// 0x1f : True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ss(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ @@ -2184,12 +2325,32 @@ _mm256_cvttps_epi32(__m256 __a) return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); } +/// \brief Returns the first element of the input vector of [4 x double]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 64-bit double containing the first element of the input vector. static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a) { return __a[0]; } +/// \brief Returns the first element of the input vector of [8 x i32]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \returns A 32-bit integer containing the first element of the input vector. static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a) { @@ -2197,6 +2358,16 @@ _mm256_cvtsi256_si32(__m256i __a) return __b[0]; } +/// \brief Returns the first element of the input vector of [8 x float]. +/// +/// \headerfile <avxintrin.h> +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 32-bit float containing the first element of the input vector. 
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a) { @@ -2380,7 +2551,9 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2407,7 +2580,9 @@ _mm_testz_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2434,7 +2609,9 @@ _mm_testc_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. 
\n @@ -2462,7 +2639,9 @@ _mm_testnzc_pd(__m128d __a, __m128d __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2489,7 +2668,9 @@ _mm_testz_ps(__m128 __a, __m128 __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2516,7 +2697,9 @@ _mm_testc_ps(__m128 __a, __m128 __b) /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. 
\n @@ -2544,7 +2727,9 @@ _mm_testnzc_ps(__m128 __a, __m128 __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2571,7 +2756,9 @@ _mm256_testz_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2598,7 +2785,9 @@ _mm256_testc_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of double-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. 
\n @@ -2626,7 +2815,9 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2653,7 +2844,9 @@ _mm256_testz_ps(__m256 __a, __m256 __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. \n @@ -2680,7 +2873,9 @@ _mm256_testc_ps(__m256 __a, __m256 __b) /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision elements in the /// first source vector and the corresponding elements in the second source -/// vector. The EFLAGS register is updated as follows: \n +/// vector. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of single-precision elements where the /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the /// ZF flag is set to 1. 
\n @@ -2706,7 +2901,9 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. \n /// If there is at least one pair of bits where the bit from the first source @@ -2730,7 +2927,9 @@ _mm256_testz_si256(__m256i __a, __m256i __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. \n /// If there is at least one pair of bits where the bit from the first source @@ -2754,7 +2953,9 @@ _mm256_testc_si256(__m256i __a, __m256i __b) } /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors and update the EFLAGS register as follows: \n +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n /// If there is at least one pair of bits where both bits are 1, the ZF flag /// is set to 0. Otherwise the ZF flag is set to 1. 
\n /// If there is at least one pair of bits where the bit from the first source @@ -3389,7 +3590,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b) { - __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); + typedef __v4di __v4di_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); } /// \brief Moves double-precision values from a 256-bit vector of [4 x double] @@ -3402,13 +3604,14 @@ _mm256_stream_si256(__m256i *__a, __m256i __b) /// /// \param __a /// A pointer to a 32-byte aligned memory location that will receive the -/// integer values. +/// double-precision floating-point values. /// \param __b /// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b) { - __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); + typedef __v4df __v4df_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); } /// \brief Moves single-precision floating point values from a 256-bit vector @@ -3428,7 +3631,8 @@ _mm256_stream_pd(double *__a, __m256d __b) static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a) { - __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); + typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); } /* Create vectors */ @@ -4310,9 +4514,10 @@ _mm256_castsi256_si128(__m256i __a) } /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a -/// 128-bit floating-point vector of [2 x double]. The lower 128 bits -/// contain the value of the source vector. The contents of the upper 128 -/// bits are undefined. +/// 128-bit floating-point vector of [2 x double]. +/// +/// The lower 128 bits contain the value of the source vector. 
The contents +/// of the upper 128 bits are undefined. /// /// \headerfile <x86intrin.h> /// @@ -4330,9 +4535,10 @@ _mm256_castpd128_pd256(__m128d __a) } /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a -/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain -/// the value of the source vector. The contents of the upper 128 bits are -/// undefined. +/// 128-bit floating-point vector of [4 x float]. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. /// /// \headerfile <x86intrin.h> /// @@ -4350,6 +4556,7 @@ _mm256_castps128_ps256(__m128 __a) } /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// /// The lower 128 bits contain the value of the source vector. The contents /// of the upper 128 bits are undefined. /// @@ -4367,6 +4574,61 @@ _mm256_castsi128_si256(__m128i __a) return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); } +/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_zextpd128_pd256(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); +} + +/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. 
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_zextps128_ps256(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 128 bits are set to zero. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_zextsi128_si256(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); +} + /* Vector insert. We use macros rather than inlines because we only want to accept @@ -4375,8 +4637,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating /// a 256-bit vector of [8 x float] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [4 x float] in the second parameter. The immediate -/// integer parameter determines between the upper or the lower 128 bits. +/// 128-bit vector of [4 x float] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. 
/// /// \headerfile <x86intrin.h> /// @@ -4420,8 +4684,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating /// a 256-bit vector of [4 x double] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [2 x double] in the second parameter. The immediate -/// integer parameter determines between the upper or the lower 128 bits. +/// 128-bit vector of [2 x double] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. /// /// \headerfile <x86intrin.h> /// @@ -4461,8 +4727,10 @@ _mm256_castsi128_si256(__m128i __a) /// \brief Constructs a new 256-bit integer vector by first duplicating a /// 256-bit integer vector given in the first parameter, and then replacing /// either the upper or the lower 128 bits with the contents of a 128-bit -/// integer vector in the second parameter. The immediate integer parameter -/// determines between the upper or the lower 128 bits. +/// integer vector in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. /// /// \headerfile <x86intrin.h> /// |