25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 26 #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 28 #if __cplusplus >= 201703L 30 #if !_GLIBCXX_SIMD_X86INTRIN 32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available" 35 _GLIBCXX_SIMD_BEGIN_NAMESPACE
40 template <
typename _Tp,
size_t _Np>
41 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
42 __to_masktype(_SimdWrapper<_Tp, _Np> __x)
43 {
return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np
>>(__x._M_data); }
45 template <
typename _TV,
47 = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
48 typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
49 _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
50 __to_masktype(_TV __x)
51 {
return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>
>(__x); }
55 template <
typename _Ap,
typename _Bp,
typename _Tp = common_type_t<_Ap, _Bp>,
56 typename _Trait = _VectorTraits<_Tp>>
57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
58 __interleave128_lo(
const _Ap& __av,
const _Bp& __bv)
62 if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
63 return _Tp{__a[0], __b[0]};
64 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
65 return _Tp{__a[0], __b[0], __a[1], __b[1]};
66 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
67 return _Tp{__a[0], __b[0], __a[1], __b[1],
68 __a[2], __b[2], __a[3], __b[3]};
69 else if constexpr (
sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
72 __a[6], __b[6], __a[7], __b[7]};
73 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
74 return _Tp{__a[0], __b[0], __a[2], __b[2]};
75 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
76 return _Tp{__a[0], __b[0], __a[1], __b[1],
77 __a[4], __b[4], __a[5], __b[5]};
78 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
81 __a[10], __b[10], __a[11], __b[11]};
82 else if constexpr (
sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
87 __a[22], __b[22], __a[23], __b[23]};
88 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
89 return _Tp{__a[0], __b[0], __a[2], __b[2],
90 __a[4], __b[4], __a[6], __b[6]};
91 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
94 __a[12], __b[12], __a[13], __b[13]};
95 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
100 __a[26], __b[26], __a[27], __b[27]};
101 else if constexpr (
sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
113 __assert_unreachable<_Tp>();
118 template <
typename _Tp,
typename _TVT = _VectorTraits<_Tp>>
119 _GLIBCXX_SIMD_INTRINSIC constexpr
bool 122 if (!__builtin_is_constant_evaluated())
124 if constexpr (__have_avx)
126 if constexpr (_TVT::template _S_is<float, 8>)
127 return _mm256_testz_ps(__a, __a);
128 else if constexpr (_TVT::template _S_is<double, 4>)
129 return _mm256_testz_pd(__a, __a);
130 else if constexpr (
sizeof(_Tp) == 32)
131 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
132 else if constexpr (_TVT::template _S_is<
float>)
133 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
134 else if constexpr (_TVT::template _S_is<
double, 2>)
135 return _mm_testz_pd(__a, __a);
137 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
139 else if constexpr (__have_sse4_1)
140 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
141 __intrin_bitcast<__m128i>(__a));
143 else if constexpr (sizeof(_Tp) <= 8)
144 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
147 const auto __b = __vector_bitcast<_LLong>(__a);
148 if constexpr (
sizeof(__b) == 16)
149 return (__b[0] | __b[1]) == 0;
150 else if constexpr (sizeof(__b) == 32)
151 return __is_zero(__lo128(__b) | __hi128(__b));
152 else if constexpr (sizeof(__b) == 64)
153 return __is_zero(__lo256(__b) | __hi256(__b));
155 __assert_unreachable<_Tp>();
161 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
162 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST
int 165 if constexpr (
sizeof(_Tp) == 32)
167 if constexpr (_TVT::template _S_is<float>)
168 return _mm256_movemask_ps(__to_intrin(__a));
169 else if constexpr (_TVT::template _S_is<double>)
170 return _mm256_movemask_pd(__to_intrin(__a));
172 return _mm256_movemask_epi8(__to_intrin(__a));
174 else if constexpr (_TVT::template _S_is<float>)
175 return _mm_movemask_ps(__to_intrin(__a));
176 else if constexpr (_TVT::template _S_is<double>)
177 return _mm_movemask_pd(__to_intrin(__a));
179 return _mm_movemask_epi8(__to_intrin(__a));
184 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 186 __testz(_TI __a, _TI __b)
188 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
189 _TVT::_S_full_size>>);
190 if (!__builtin_is_constant_evaluated())
192 if constexpr (
sizeof(_TI) == 32)
194 if constexpr (_TVT::template _S_is<float>)
195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
196 else if constexpr (_TVT::template _S_is<double>)
197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
201 else if constexpr (_TVT::template _S_is<float> && __have_avx)
202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
203 else if constexpr (_TVT::template _S_is<double> && __have_avx)
204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
205 else if constexpr (__have_sse4_1)
206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
207 __intrin_bitcast<__m128i>(__to_intrin(__b)));
209 return __movemask(0 == __and(__a, __b)) != 0;
212 return __is_zero(__and(__a, __b));
218 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 220 __testc(_TI __a, _TI __b)
222 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
223 _TVT::_S_full_size>>);
224 if (__builtin_is_constant_evaluated())
225 return __is_zero(__andnot(__a, __b));
227 if constexpr (
sizeof(_TI) == 32)
229 if constexpr (_TVT::template _S_is<float>)
230 return _mm256_testc_ps(__a, __b);
231 else if constexpr (_TVT::template _S_is<double>)
232 return _mm256_testc_pd(__a, __b);
234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
236 else if constexpr (_TVT::template _S_is<float> && __have_avx)
237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
238 else if constexpr (_TVT::template _S_is<double> && __have_avx)
239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
244 __intrin_bitcast<__m128i>(__to_intrin(__b)));
250 template <
typename _TI,
typename _TVT = _VectorTraits<_TI>>
251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
int 252 __testnzc(_TI __a, _TI __b)
254 static_assert(is_same_v<_TI, __intrinsic_type_t<
typename _TVT::value_type,
255 _TVT::_S_full_size>>);
256 if (!__builtin_is_constant_evaluated())
258 if constexpr (
sizeof(_TI) == 32)
260 if constexpr (_TVT::template _S_is<float>)
261 return _mm256_testnzc_ps(__a, __b);
262 else if constexpr (_TVT::template _S_is<double>)
263 return _mm256_testnzc_pd(__a, __b);
265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
267 else if constexpr (_TVT::template _S_is<float> && __have_avx)
268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
269 else if constexpr (_TVT::template _S_is<double> && __have_avx)
270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
271 else if constexpr (__have_sse4_1)
272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
273 __intrin_bitcast<__m128i>(__to_intrin(__b)));
275 return __movemask(0 == __and(__a, __b)) == 0
276 && __movemask(0 == __andnot(__a, __b)) == 0;
279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
286 template <
typename _Tp,
typename _TVT = _VectorTraits<_Tp>>
287 _GLIBCXX_SIMD_INTRINSIC _Tp
290 if constexpr (
sizeof(_Tp) == 16)
293 is_floating_point_v<typename _TVT::value_type>, float,
int>>(__a);
294 return reinterpret_cast<_Tp
>(
295 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
297 else if constexpr (
sizeof(_Tp) == 32)
300 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
301 return reinterpret_cast<_Tp
>(
302 decltype(__x){__x[0], __x[2], __x[1], __x[3]});
304 else if constexpr (
sizeof(_Tp) == 64)
307 is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
308 return reinterpret_cast<_Tp
>(decltype(__x){__x[0], __x[1], __x[4],
309 __x[5], __x[2], __x[3],
313 __assert_unreachable<_Tp>();
318 template <
typename _Tp>
319 _GLIBCXX_SIMD_INTRINSIC
auto 320 __maskload_epi32(
const int* __ptr, _Tp __k)
322 if constexpr (
sizeof(__k) == 16)
323 return _mm_maskload_epi32(__ptr, __k);
325 return _mm256_maskload_epi32(__ptr, __k);
330 template <typename _Tp>
331 _GLIBCXX_SIMD_INTRINSIC auto
332 __maskload_epi64(const _LLong* __ptr, _Tp __k)
334 if constexpr (
sizeof(__k) == 16)
335 return _mm_maskload_epi64(__ptr, __k);
337 return _mm256_maskload_epi64(__ptr, __k);
342 template <typename _Tp>
343 _GLIBCXX_SIMD_INTRINSIC auto
344 __maskload_ps(const
float* __ptr, _Tp __k)
346 if constexpr (
sizeof(__k) == 16)
347 return _mm_maskload_ps(__ptr, __k);
349 return _mm256_maskload_ps(__ptr, __k);
354 template <typename _Tp>
355 _GLIBCXX_SIMD_INTRINSIC auto
356 __maskload_pd(const
double* __ptr, _Tp __k)
358 if constexpr (
sizeof(__k) == 16)
359 return _mm_maskload_pd(__ptr, __k);
361 return _mm256_maskload_pd(__ptr, __k);
367 template <
size_t _Np,
typename _Tp,
typename _Kp>
368 _GLIBCXX_SIMD_INTRINSIC constexpr
auto 369 __movm(_Kp __k) noexcept
371 static_assert(is_unsigned_v<_Kp>);
372 if constexpr (
sizeof(_Tp) == 1 && __have_avx512bw)
374 if constexpr (_Np <= 16 && __have_avx512vl)
375 return __builtin_ia32_cvtmask2b128(__k);
376 else if constexpr (_Np <= 32 && __have_avx512vl)
377 return __builtin_ia32_cvtmask2b256(__k);
379 return __builtin_ia32_cvtmask2b512(__k);
381 else if constexpr (
sizeof(_Tp) == 2 && __have_avx512bw)
383 if constexpr (_Np <= 8 && __have_avx512vl)
384 return __builtin_ia32_cvtmask2w128(__k);
385 else if constexpr (_Np <= 16 && __have_avx512vl)
386 return __builtin_ia32_cvtmask2w256(__k);
388 return __builtin_ia32_cvtmask2w512(__k);
390 else if constexpr (
sizeof(_Tp) == 4 && __have_avx512dq)
392 if constexpr (_Np <= 4 && __have_avx512vl)
393 return __builtin_ia32_cvtmask2d128(__k);
394 else if constexpr (_Np <= 8 && __have_avx512vl)
395 return __builtin_ia32_cvtmask2d256(__k);
397 return __builtin_ia32_cvtmask2d512(__k);
399 else if constexpr (
sizeof(_Tp) == 8 && __have_avx512dq)
401 if constexpr (_Np <= 2 && __have_avx512vl)
402 return __builtin_ia32_cvtmask2q128(__k);
403 else if constexpr (_Np <= 4 && __have_avx512vl)
404 return __builtin_ia32_cvtmask2q256(__k);
406 return __builtin_ia32_cvtmask2q512(__k);
409 __assert_unreachable<_Tp>();
413 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 414 #include "simd_x86_conversions.h" 418 template <
typename _Tp,
size_t _Np>
424 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
427 template <
typename _Tp,
size_t _Np>
433 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
436 template <
typename _Tp,
size_t _Np>
442 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
445 template <
typename _Tp,
size_t _Np>
451 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
454 template <
typename _Tp,
size_t _Np>
458 return __have_avx512f
460 float> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
463 template <
typename _Tp,
size_t _Np>
467 return __have_avx512f
469 double> &&
sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
473 struct _MaskImplX86Mixin;
476 struct _CommonImplX86 : _CommonImplBuiltin
478 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 480 template <
typename _From,
typename _To,
size_t _ToSize>
481 static constexpr
bool 482 _S_converts_via_decomposition()
484 if constexpr (is_integral_v<
485 _From> && is_integral_v<_To> &&
sizeof(_From) == 8
487 return (sizeof(_To) == 2 && !__have_ssse3)
488 || (sizeof(_To) == 1 && !__have_avx512f);
489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
497 return (sizeof(_To) == 4 && _ToSize == 16)
498 || (sizeof(_To) == 8 && _ToSize < 64);
503 template <typename _From, typename _To,
size_t _ToSize>
504 static inline constexpr
bool __converts_via_decomposition_v
505 = _S_converts_via_decomposition<_From, _To, _ToSize>();
510 using _CommonImplBuiltin::_S_store;
512 template <
typename _Tp,
size_t _Np>
513 _GLIBCXX_SIMD_INTRINSIC
static constexpr
void 514 _S_store(_SimdWrapper<_Tp, _Np> __x,
void* __addr)
516 constexpr
size_t _Bytes = _Np *
sizeof(_Tp);
518 if (__builtin_is_constant_evaluated())
519 _CommonImplBuiltin::_S_store(__x, __addr);
520 else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
522 const auto __v = __to_intrin(__x);
524 if constexpr (_Bytes & 1)
526 if constexpr (_Bytes < 16)
527 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
528 __intrin_bitcast<__m128i>(__v));
529 else if constexpr (_Bytes < 32)
530 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
531 __intrin_bitcast<__m256i>(__v));
533 _mm512_mask_storeu_epi8(__addr,
534 0xffffffffffffffffull >> (64 - _Bytes),
535 __intrin_bitcast<__m512i>(__v));
537 else if constexpr (_Bytes & 2)
539 if constexpr (_Bytes < 16)
540 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
541 __intrin_bitcast<__m128i>(__v));
542 else if constexpr (_Bytes < 32)
543 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
544 __intrin_bitcast<__m256i>(__v));
546 _mm512_mask_storeu_epi16(__addr,
547 0xffffffffull >> (32 - _Bytes / 2),
548 __intrin_bitcast<__m512i>(__v));
550 else if constexpr (_Bytes & 4)
552 if constexpr (_Bytes < 16)
553 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
554 __intrin_bitcast<__m128i>(__v));
555 else if constexpr (_Bytes < 32)
556 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
557 __intrin_bitcast<__m256i>(__v));
559 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
560 __intrin_bitcast<__m512i>(__v));
566 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " 567 "- 1)) != 0 is impossible");
568 if constexpr (_Bytes < 32)
569 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
570 __intrin_bitcast<__m256i>(__v));
572 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
573 __intrin_bitcast<__m512i>(__v));
577 _CommonImplBuiltin::_S_store(__x, __addr);
582 template <
size_t _Np,
bool _Sanitized>
583 _GLIBCXX_SIMD_INTRINSIC static constexpr
void 584 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x,
bool* __mem)
586 if (__builtin_is_constant_evaluated())
587 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
588 else if constexpr (__have_avx512bw_vl)
589 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
590 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
591 if constexpr (_Np <= 16)
592 return _mm_movm_epi8(__x._M_to_bits());
593 else if constexpr (_Np <= 32)
594 return _mm256_movm_epi8(__x._M_to_bits());
595 else if constexpr (_Np <= 64)
596 return _mm512_movm_epi8(__x._M_to_bits());
598 __assert_unreachable<_SizeConstant<_Np>>();
601 else if constexpr (__have_bmi2)
603 if constexpr (_Np <= 4)
604 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
606 __execute_n_times<__div_roundup(_Np, sizeof(
size_t))>(
607 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
608 constexpr
size_t __offset = __i *
sizeof(size_t);
609 constexpr
int __todo =
std::min(
sizeof(
size_t), _Np - __offset);
610 if constexpr (__todo == 1)
611 __mem[__offset] = __x[__offset];
616 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
617 0x0101010101010101ULL);
620 __x.template _M_extract<__offset>()._M_to_bits(),
623 _S_store<__todo>(__bools, __mem + __offset);
627 else if constexpr (__have_sse2 && _Np > 7)
628 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
629 constexpr
int __offset = __i * 16;
630 constexpr
int __todo =
std::min(16,
int(_Np) - __offset);
631 const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
632 __vector_type16_t<_UChar> __bools;
633 if constexpr (__have_avx512f)
636 = _mm512_maskz_mov_epi32(__bits, __to_intrin(
637 __vector_broadcast<16>(1)));
639 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
640 __todo > 8 ? __hi256(__as32bits)
642 __bools = __vector_bitcast<_UChar>(
643 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
647 using _V = __vector_type_t<_UChar, 16>;
648 auto __tmp = _mm_cvtsi32_si128(__bits);
649 __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
650 __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
651 __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
652 _V __tmp2 =
reinterpret_cast<_V
>(__tmp);
653 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
654 1, 2, 4, 8, 16, 32, 64, 128};
655 __bools = (__tmp2 == 0) + 1;
657 _S_store<__todo>(__bools, __mem + __offset);
660 _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
669 template <
typename _Kp,
typename _TV>
670 _GLIBCXX_SIMD_INTRINSIC
static _TV
671 _S_blend_avx512(
const _Kp __k,
const _TV __a,
const _TV __b) noexcept
673 static_assert(__is_vector_type_v<_TV>);
674 using _Tp =
typename _VectorTraits<_TV>::value_type;
675 static_assert(
sizeof(_TV) >= 16);
676 static_assert(
sizeof(_Tp) <= 8);
678 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a;
681 = conditional_t<(sizeof(_Tp) > 2),
683 conditional_t<sizeof(_Tp) == 1, char, short>>;
684 [[maybe_unused]]
const auto __aa = __vector_bitcast<_IntT>(__a);
685 [[maybe_unused]]
const auto __bb = __vector_bitcast<_IntT>(__b);
686 if constexpr (
sizeof(_TV) == 64)
688 if constexpr (
sizeof(_Tp) == 1)
689 return reinterpret_cast<_TV>(
690 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
691 else if constexpr (sizeof(_Tp) == 2)
692 return reinterpret_cast<_TV>(
693 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
694 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
695 return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
696 else if constexpr (sizeof(_Tp) == 4)
697 return reinterpret_cast<_TV>(
698 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
699 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
700 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
701 else if constexpr (sizeof(_Tp) == 8)
702 return reinterpret_cast<_TV>(
703 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
705 else if constexpr (sizeof(_TV) == 32)
707 if constexpr (
sizeof(_Tp) == 1)
708 return reinterpret_cast<_TV>(
709 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
710 else if constexpr (sizeof(_Tp) == 2)
711 return reinterpret_cast<_TV>(
712 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
713 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
714 return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
715 else if constexpr (sizeof(_Tp) == 4)
716 return reinterpret_cast<_TV>(
717 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
718 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
719 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
720 else if constexpr (sizeof(_Tp) == 8)
721 return reinterpret_cast<_TV>(
722 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
724 else if constexpr (sizeof(_TV) == 16)
726 if constexpr (
sizeof(_Tp) == 1)
727 return reinterpret_cast<_TV>(
728 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
729 else if constexpr (sizeof(_Tp) == 2)
730 return reinterpret_cast<_TV>(
731 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
732 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
733 return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
734 else if constexpr (sizeof(_Tp) == 4)
735 return reinterpret_cast<_TV>(
736 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
737 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
738 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
739 else if constexpr (sizeof(_Tp) == 8)
740 return reinterpret_cast<_TV>(
741 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
752 template <
typename _Tp>
753 _GLIBCXX_SIMD_INTRINSIC
static _Tp
754 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept
756 static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
759 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
760 __m128 __k)
const noexcept
762 return __builtin_ia32_blendvps(__a, __b, __k);
764 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
765 __m128d __k)
const noexcept
767 return __builtin_ia32_blendvpd(__a, __b, __k);
769 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
770 __m128i __k)
const noexcept
772 return reinterpret_cast<__m128i
>(
773 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
774 reinterpret_cast<__v16qi>(__b),
775 reinterpret_cast<__v16qi>(__k)));
777 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
778 __m256 __k)
const noexcept
780 return __builtin_ia32_blendvps256(__a, __b, __k);
782 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
783 __m256d __k)
const noexcept
785 return __builtin_ia32_blendvpd256(__a, __b, __k);
787 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
788 __m256i __k)
const noexcept
790 if constexpr (__have_avx2)
791 return reinterpret_cast<__m256i
>(
792 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
793 reinterpret_cast<__v32qi>(__b),
794 reinterpret_cast<__v32qi>(__k)));
796 return reinterpret_cast<__m256i
>(
797 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
798 reinterpret_cast<__v8sf>(__b),
799 reinterpret_cast<__v8sf>(__k)));
802 return __eval(__a, __b, __k);
809 template <
typename _Tp,
size_t _Np>
810 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
811 _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
812 _SimdWrapper<_Tp, _Np> __at1)
814 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
815 if (__k._M_is_constprop() && __at0._M_is_constprop()
816 && __at1._M_is_constprop())
817 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
818 [&](
auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
819 return __k[__i] ? __at1[__i] : __at0[__i];
821 else if constexpr (
sizeof(__at0) == 64
822 || (__have_avx512vl &&
sizeof(__at0) >= 16))
823 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
826 static_assert((__have_avx512vl &&
sizeof(__at0) < 16)
827 || !__have_avx512vl);
828 constexpr
size_t __size = (__have_avx512vl ? 16 : 64) /
sizeof(_Tp);
829 return __vector_bitcast<_Tp, _Np>(
830 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
831 __vector_bitcast<_Tp, __size>(__at1)));
835 template <
typename _Tp,
size_t _Np>
836 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
837 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
838 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
840 const auto __kk = __wrapper_bitcast<_Tp>(__k);
841 if (__builtin_is_constant_evaluated()
842 || (__kk._M_is_constprop() && __at0._M_is_constprop()
843 && __at1._M_is_constprop()))
845 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
846 if (__r._M_is_constprop())
849 if constexpr (((__have_avx512f &&
sizeof(__at0) == 64) || __have_avx512vl)
850 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
853 _SimdWrapper<bool, _Np>(
854 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
862 if constexpr (__have_sse4_1)
863 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
866 return __or(__andnot(__kk, __at0), __and(__kk, __at1));
875 template <
typename _Abi,
typename>
876 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
878 using _Base = _SimdImplBuiltin<_Abi>;
880 template <
typename _Tp>
881 using _MaskMember =
typename _Base::template _MaskMember<_Tp>;
883 template <
typename _Tp>
884 static constexpr
size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
886 template <
typename _Tp>
887 static constexpr
size_t _S_size = _Abi::template _S_size<_Tp>;
889 template <
typename _Tp>
890 static constexpr
size_t _S_max_store_size
891 = (
sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
892 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
895 using _MaskImpl =
typename _Abi::_MaskImpl;
898 template <
typename _Tp,
size_t _Np,
typename _Up>
899 static inline _SimdWrapper<_Tp, _Np>
900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
901 const _Up* __mem) noexcept
903 static_assert(_Np == _S_size<_Tp>);
904 if constexpr (is_same_v<_Tp, _Up> ||
905 (
sizeof(_Tp) ==
sizeof(_Up)
907 _Tp> == is_integral_v<_Up>)
911 [[maybe_unused]]
const auto __intrin = __to_intrin(__merge);
912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
916 if constexpr (
sizeof(__intrin) == 16)
917 __merge = __vector_bitcast<_Tp, _Np>(
918 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
919 else if constexpr (sizeof(__merge) == 32)
920 __merge = __vector_bitcast<_Tp, _Np>(
921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
922 else if constexpr (sizeof(__merge) == 64)
923 __merge = __vector_bitcast<_Tp, _Np>(
924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
926 __assert_unreachable<_Tp>();
928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
932 if constexpr (
sizeof(__intrin) == 16)
933 __merge = __vector_bitcast<_Tp, _Np>(
934 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
935 else if constexpr (sizeof(__intrin) == 32)
936 __merge = __vector_bitcast<_Tp, _Np>(
937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
938 else if constexpr (sizeof(__intrin) == 64)
939 __merge = __vector_bitcast<_Tp, _Np>(
940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
942 __assert_unreachable<_Tp>();
944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948 if constexpr (
sizeof(__intrin) == 16)
949 __merge = __vector_bitcast<_Tp, _Np>(
950 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
951 else if constexpr (sizeof(__intrin) == 32)
952 __merge = __vector_bitcast<_Tp, _Np>(
953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
954 else if constexpr (sizeof(__intrin) == 64)
955 __merge = __vector_bitcast<_Tp, _Np>(
956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
958 __assert_unreachable<_Tp>();
960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964 if constexpr (
sizeof(__intrin) == 16)
965 __merge = __vector_bitcast<_Tp, _Np>(
966 _mm_mask_loadu_ps(__intrin, __kk, __mem));
967 else if constexpr (sizeof(__intrin) == 32)
968 __merge = __vector_bitcast<_Tp, _Np>(
969 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
970 else if constexpr (sizeof(__intrin) == 64)
971 __merge = __vector_bitcast<_Tp, _Np>(
972 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
974 __assert_unreachable<_Tp>();
976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
977 && is_integral_v<_Up>)
979 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982 __vector_bitcast<_Tp, _Np>(
983 __maskload_epi32(reinterpret_cast<const int*>(__mem),
986 else if constexpr (__have_avx &&
sizeof(_Tp) == 4)
988 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991 __vector_bitcast<_Tp, _Np>(
992 __maskload_ps(reinterpret_cast<const float*>(__mem),
995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
996 &&
sizeof(_Tp) == 8 && is_integral_v<_Up>)
998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
999 if constexpr (
sizeof(__intrin) == 16)
1000 __merge = __vector_bitcast<_Tp, _Np>(
1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
1002 else if constexpr (sizeof(__intrin) == 32)
1003 __merge = __vector_bitcast<_Tp, _Np>(
1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
1005 else if constexpr (sizeof(__intrin) == 64)
1006 __merge = __vector_bitcast<_Tp, _Np>(
1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
1009 __assert_unreachable<_Tp>();
1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1015 if constexpr (
sizeof(__intrin) == 16)
1016 __merge = __vector_bitcast<_Tp, _Np>(
1017 _mm_mask_loadu_pd(__intrin, __kk, __mem));
1018 else if constexpr (sizeof(__intrin) == 32)
1019 __merge = __vector_bitcast<_Tp, _Np>(
1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
1021 else if constexpr (sizeof(__intrin) == 64)
1022 __merge = __vector_bitcast<_Tp, _Np>(
1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
1025 __assert_unreachable<_Tp>();
1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1028 && is_integral_v<_Up>)
1030 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
1034 reinterpret_cast<const _LLong*>(__mem),
1035 __to_intrin(__k))));
1037 else if constexpr (__have_avx &&
sizeof(_Tp) == 8)
1039 static_assert(
sizeof(__intrin) == 16 ||
sizeof(__intrin) == 32);
1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
1042 __vector_bitcast<_Tp, _Np>(
1043 __maskload_pd(reinterpret_cast<const double*>(__mem),
1044 __to_intrin(__k))));
1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1048 [&](
auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1077 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1083 template <
typename _Tp,
size_t _Np>
1084 _GLIBCXX_SIMD_INTRINSIC
static void 1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper<bool, _Np> __k)
1087 [[maybe_unused]]
const auto __vi = __to_intrin(__v);
1088 if constexpr (
sizeof(__vi) == 64)
1090 static_assert(
sizeof(__v) == 64 && __have_avx512f);
1091 if constexpr (__have_avx512bw &&
sizeof(_Tp) == 1)
1092 _mm512_mask_storeu_epi8(__mem, __k, __vi);
1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
1094 _mm512_mask_storeu_epi16(__mem, __k, __vi);
1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
1097 if constexpr (is_integral_v<_Tp>)
1098 _mm512_mask_storeu_epi32(__mem, __k, __vi);
1100 _mm512_mask_storeu_ps(__mem, __k, __vi);
1102 else if constexpr (__have_avx512f &&
sizeof(_Tp) == 8)
1104 if constexpr (is_integral_v<_Tp>)
1105 _mm512_mask_storeu_epi64(__mem, __k, __vi);
1107 _mm512_mask_storeu_pd(__mem, __k, __vi);
1109 #if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32 1111 else if constexpr (__have_sse2)
1113 using _M = __vector_type_t<_Tp, _Np>;
1114 using _MVT = _VectorTraits<_M>;
1115 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
1116 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
1117 reinterpret_cast<char*>(__mem));
1118 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
1119 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1120 __k._M_data >> 1 * _MVT::_S_full_size)),
1121 reinterpret_cast<char*>(__mem) + 1 * 16);
1122 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
1123 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1124 __k._M_data >> 2 * _MVT::_S_full_size)),
1125 reinterpret_cast<char*>(__mem) + 2 * 16);
1126 if constexpr (_Np > 48 /
sizeof(_Tp))
1127 _mm_maskmoveu_si128(
1128 __auto_bitcast(__extract<3, 4>(__v._M_data)),
1129 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
1130 __k._M_data >> 3 * _MVT::_S_full_size)),
1131 reinterpret_cast<char*>(__mem) + 3 * 16);
1135 __assert_unreachable<_Tp>();
1137 else if constexpr (
sizeof(__vi) == 32)
1139 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1140 _mm256_mask_storeu_epi8(__mem, __k, __vi);
1141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1142 _mm256_mask_storeu_epi16(__mem, __k, __vi);
1143 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1145 if constexpr (is_integral_v<_Tp>)
1146 _mm256_mask_storeu_epi32(__mem, __k, __vi);
1148 _mm256_mask_storeu_ps(__mem, __k, __vi);
1150 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1152 if constexpr (is_integral_v<_Tp>)
1153 _mm256_mask_storeu_epi64(__mem, __k, __vi);
1155 _mm256_mask_storeu_pd(__mem, __k, __vi);
1157 else if constexpr (__have_avx512f
1158 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1161 _S_masked_store_nocvt(
1162 _SimdWrapper64<_Tp>(
1163 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
1164 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
1167 _S_masked_store_nocvt(__v, __mem,
1168 _MaskImpl::template _S_to_maskvector<
1169 __int_for_sizeof_t<_Tp>, _Np>(__k));
1171 else if constexpr (
sizeof(__vi) == 16)
1173 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1174 _mm_mask_storeu_epi8(__mem, __k, __vi);
1175 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1176 _mm_mask_storeu_epi16(__mem, __k, __vi);
1177 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
1179 if constexpr (is_integral_v<_Tp>)
1180 _mm_mask_storeu_epi32(__mem, __k, __vi);
1182 _mm_mask_storeu_ps(__mem, __k, __vi);
1184 else if constexpr (__have_avx512vl &&
sizeof(_Tp) == 8)
1186 if constexpr (is_integral_v<_Tp>)
1187 _mm_mask_storeu_epi64(__mem, __k, __vi);
1189 _mm_mask_storeu_pd(__mem, __k, __vi);
1191 else if constexpr (__have_avx512f
1192 && (
sizeof(_Tp) >= 4 || __have_avx512bw))
1195 _S_masked_store_nocvt(
1196 _SimdWrapper64<_Tp>(
1197 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
1198 __mem, _SimdWrapper<
bool, 64 /
sizeof(_Tp)>(__k._M_data));
1201 _S_masked_store_nocvt(__v, __mem,
1202 _MaskImpl::template _S_to_maskvector<
1203 __int_for_sizeof_t<_Tp>, _Np>(__k));
1206 __assert_unreachable<_Tp>();
1209 template <
typename _Tp,
size_t _Np>
1210 _GLIBCXX_SIMD_INTRINSIC
static void 1211 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
1212 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
1214 if constexpr (
sizeof(__v) <= 16)
1216 [[maybe_unused]]
const auto __vi
1217 = __intrin_bitcast<__m128i>(__as_vector(__v));
1218 [[maybe_unused]]
const auto __ki
1219 = __intrin_bitcast<__m128i>(__as_vector(__k));
1220 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1221 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1223 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1225 && is_integral_v<_Tp>)
1226 _mm_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
1227 else if constexpr (__have_avx && sizeof(_Tp) == 4)
1228 _mm_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1229 __vector_bitcast<
float>(__vi));
1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1231 && is_integral_v<_Tp>)
1232 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
1233 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1234 _mm_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1235 __vector_bitcast<
double>(__vi));
1236 else if constexpr (__have_sse2)
1237 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<
char*>(__mem));
1239 else if constexpr (sizeof(__v) == 32)
1241 [[maybe_unused]]
const auto __vi
1242 = __intrin_bitcast<__m256i>(__as_vector(__v));
1243 [[maybe_unused]]
const auto __ki
1244 = __intrin_bitcast<__m256i>(__as_vector(__k));
1245 if constexpr (__have_avx512bw_vl &&
sizeof(_Tp) == 1)
1246 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
1247 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
1248 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
1249 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
1250 && is_integral_v<_Tp>)
1251 _mm256_maskstore_epi32(reinterpret_cast<
int*>(__mem), __ki, __vi);
1252 else if constexpr (sizeof(_Tp) == 4)
1253 _mm256_maskstore_ps(reinterpret_cast<
float*>(__mem), __ki,
1254 __vector_bitcast<
float>(__v));
1255 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
1256 && is_integral_v<_Tp>)
1257 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
1259 else if constexpr (__have_avx && sizeof(_Tp) == 8)
1260 _mm256_maskstore_pd(reinterpret_cast<
double*>(__mem), __ki,
1261 __vector_bitcast<
double>(__v));
1262 else if constexpr (__have_sse2)
1264 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
1265 reinterpret_cast<char*>(__mem));
1266 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
1267 reinterpret_cast<char*>(__mem) + 16);
1271 __assert_unreachable<_Tp>();
1276 template <
typename _Tp,
size_t _Np,
typename _Up>
1277 _GLIBCXX_SIMD_INTRINSIC
static void 1278 _S_masked_store(
const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
1279 const _MaskMember<_Tp> __k) noexcept
1281 if constexpr (is_integral_v<
1282 _Tp> && is_integral_v<_Up> &&
sizeof(_Tp) >
sizeof(_Up)
1283 && __have_avx512f && (
sizeof(_Tp) >= 4 || __have_avx512bw)
1284 && (
sizeof(__v) == 64 || __have_avx512vl))
1286 const auto __vi = __to_intrin(__v);
1287 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
1288 if constexpr (
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4
1289 &&
sizeof(__vi) == 64)
1290 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1291 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1292 && sizeof(__vi) == 32)
1293 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1294 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
1295 && sizeof(__vi) == 16)
1296 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
1297 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1298 && sizeof(__vi) == 64)
1299 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1300 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1301 && sizeof(__vi) == 32)
1302 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1303 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
1304 && sizeof(__vi) == 16)
1305 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
1306 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1307 && sizeof(__vi) == 64)
1308 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1309 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1310 && sizeof(__vi) == 32)
1311 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1312 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
1313 && sizeof(__vi) == 16)
1314 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
1315 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1316 && sizeof(__vi) == 64)
1317 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1318 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1319 && sizeof(__vi) == 32)
1320 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1321 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
1322 && sizeof(__vi) == 16)
1323 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
1324 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1325 && sizeof(__vi) == 64)
1326 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1327 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1328 && sizeof(__vi) == 32)
1329 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1330 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
1331 && sizeof(__vi) == 16)
1332 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
1333 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1334 && sizeof(__vi) == 64)
1335 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1336 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1337 && sizeof(__vi) == 32)
1338 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1339 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
1340 && sizeof(__vi) == 16)
1341 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
1343 __assert_unreachable<_Tp>();
1346 _Base::_S_masked_store(__v, __mem, __k);
1351 template <typename _V, typename _VVT = _VectorTraits<_V>>
1352 _GLIBCXX_SIMD_INTRINSIC static constexpr _V
1353 _S_multiplies(_V __x, _V __y)
1355 using _Tp =
typename _VVT::value_type;
1356 if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
1357 || __y._M_is_constprop())
1358 return __as_vector(__x) * __as_vector(__y);
1359 else if constexpr (
sizeof(_Tp) == 1)
1361 if constexpr (
sizeof(_V) == 2)
1363 const auto __xs =
reinterpret_cast<short>(__x._M_data);
1364 const auto __ys =
reinterpret_cast<short>(__y._M_data);
1365 return reinterpret_cast<__vector_type_t<_Tp, 2>
>(short(
1366 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
1368 else if constexpr (
sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
1370 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1371 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1372 return reinterpret_cast<__vector_type_t<_Tp, 3>
>(
1373 ((__xi * __yi) & 0xff)
1374 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1375 | ((__xi >> 16) * (__yi & 0xff0000)));
1377 else if constexpr (
sizeof(_V) == 4)
1379 const auto __xi =
reinterpret_cast<int>(__x._M_data);
1380 const auto __yi =
reinterpret_cast<int>(__y._M_data);
1381 return reinterpret_cast<__vector_type_t<_Tp, 4>
>(
1382 ((__xi * __yi) & 0xff)
1383 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
1384 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
1385 | ((__xi >> 24) * (__yi & 0xff000000u)));
1387 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1388 && is_signed_v<_Tp>)
1389 return __convert<typename _VVT::type>(
1390 __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
1391 * __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
1392 else if constexpr (
sizeof(_V) == 8 && __have_avx2
1393 && is_unsigned_v<_Tp>)
1394 return __convert<typename _VVT::type>(
1395 __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
1396 * __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
1400 constexpr
size_t __full_size = _VVT::_S_full_size;
1401 constexpr
int _Np =
sizeof(_V) >= 16 ? __full_size / 2 : 8;
1402 using _ShortW = _SimdWrapper<short, _Np>;
1403 const _ShortW __even = __vector_bitcast<short, _Np>(__x)
1404 * __vector_bitcast<short, _Np>(__y);
1405 _ShortW __high_byte = _ShortW()._M_data - 256;
1408 = (__vector_bitcast<short, _Np>(__x) >> 8)
1409 * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
1410 if constexpr (__have_avx512bw &&
sizeof(_V) > 2)
1411 return _CommonImplX86::_S_blend_avx512(
1412 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
1413 __vector_bitcast<_Tp>(__odd));
1414 else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
1415 return _CommonImplX86::_S_blend_intrin(__to_intrin(
1417 __to_intrin(__even),
1418 __to_intrin(__odd));
1421 __or(__andnot(__high_byte, __even), __odd));
1425 return _Base::_S_multiplies(__x, __y);
1430 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1431 template <
typename _Tp,
size_t _Np>
1432 _GLIBCXX_SIMD_INTRINSIC
static constexpr _SimdWrapper<_Tp, _Np>
1433 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1435 if (!__builtin_is_constant_evaluated()
1436 && !__builtin_constant_p(__y._M_data))
1437 if constexpr (is_integral_v<_Tp> &&
sizeof(_Tp) <= 4)
1456 using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
1457 constexpr
size_t __n_intermediate
1458 =
std::min(_Np, (__have_avx512f ? 64
1462 using _FloatV = __vector_type_t<_Float, __n_intermediate>;
1463 constexpr
size_t __n_floatv
1464 = __div_roundup(_Np, __n_intermediate);
1465 using _R = __vector_type_t<_Tp, _Np>;
1466 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
1467 const auto __yf = __convert_all<_FloatV, __n_floatv>(
1468 _Abi::__make_padding_nonzero(__as_vector(__y)));
1469 return __call_with_n_evaluations<__n_floatv>(
1470 [](
auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1471 return __vector_convert<_R>(__quotients...);
1473 [&__xf, &__yf](
auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1474 -> _SimdWrapper<_Float, __n_intermediate>
1476 #if __RECIPROCAL_MATH__ 1482 if constexpr (__have_avx)
1486 if constexpr (
sizeof(_Tp) == 4)
1487 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}
" 1489 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1491 asm("vdivps\t{%2, %1, %0|%0, %1, %2}
" 1493 : "x
"(__xf[__i]), "x
"(__yf[__i])); 1498 if constexpr (sizeof(_Tp) == 4) 1499 asm("divpd\t{%1, %0|%0, %1}
" 1503 asm("divps\t{%1, %0|%0, %1}
" 1509 return __xf[__i] / __yf[__i]; 1513 /* 64-bit int division is potentially optimizable via double division if 1514 * the value in __x is small enough and the conversion between 1515 * int<->double is efficient enough: 1516 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1519 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1521 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1522 0xffe0'0000'0000'0000ull})) 1524 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1529 return _Base::_S_divides(__x, __y); 1532 using _Base::_S_divides; 1533 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1537 template <typename _Tp, size_t _Np> 1538 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1539 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1541 if (__builtin_is_constant_evaluated() 1542 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1543 return _Base::_S_modulus(__x, __y); 1545 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1549 // _S_bit_shift_left {{{ 1550 // Notes on UB. C++2a [expr.shift] says: 1551 // -1- [...] The operands shall be of integral or unscoped enumeration type 1552 // and integral promotions are performed. The type of the result is that 1553 // of the promoted left operand. The behavior is undefined if the right 1554 // operand is negative, or greater than or equal to the width of the 1555 // promoted left operand. 1556 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1557 // 2^N, where N is the width of the type of the result. 1559 // C++17 [expr.shift] says: 1560 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1561 // bits are zero-filled. If E1 has an unsigned type, the value of the 1562 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1563 // representable in the result type. Otherwise, if E1 has a signed type 1564 // and non-negative value, and E1 × 2^E2 is representable in the 1565 // corresponding unsigned type of the result type, then that value, 1566 // converted to the result type, is the resulting value; otherwise, the 1567 // behavior is undefined. 1570 // With C++2a signed and unsigned types have the same UB 1572 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1574 // With C++17 there's little room for optimizations because the standard 1575 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1576 // short and char shifts must assume shifts affect bits of neighboring 1578 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1579 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1580 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1581 _S_bit_shift_left(_Tp __xx, int __y) 1583 using _V = typename _TVT::type; 1584 using _Up = typename _TVT::value_type; 1586 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1587 if (__builtin_is_constant_evaluated()) 1589 #if __cplusplus > 201703 1590 // after C++17, signed shifts have no UB, and behave just like unsigned 1592 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1593 return __vector_bitcast<_Up>( 1594 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1597 else if constexpr (sizeof(_Up) == 1) 1599 // (cf. 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1600 if (__builtin_constant_p(__y)) 1611 else if (__y > 2 && __y < 8) 1613 if constexpr (sizeof(__x) > sizeof(unsigned)) 1615 const _UChar __mask = 0xff << __y; // precomputed vector 1616 return __vector_bitcast<_Up>( 1617 __vector_bitcast<_UChar>( 1618 __vector_bitcast<unsigned>(__x) << __y) 1623 const unsigned __mask 1624 = (0xff & (0xff << __y)) * 0x01010101u; 1625 return reinterpret_cast<_V>( 1626 static_cast<__int_for_sizeof_t<_V>>( 1628 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1633 else if (__y >= 8 && __y < 32) 1636 __builtin_unreachable(); 1638 // general strategy in the following: use an sllv instead of sll 1639 // instruction, because it's 2 to 4 times faster: 1640 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1641 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1642 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1643 _mm256_set1_epi16(__y)))); 1644 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1645 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1646 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1647 _mm512_set1_epi16(__y)))); 1648 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1650 const auto __shift = _mm512_set1_epi16(__y); 1651 return __vector_bitcast<_Up>( 1652 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1653 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1654 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1655 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1657 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1660 const auto __shift = _mm_cvtsi32_si128(__y); 1662 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1663 __k |= _mm256_srli_epi16(__k, 8); 1664 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1667 const _Up __k = 0xff << __y; 1668 return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y) 1674 const auto __shift = _mm_cvtsi32_si128(__y); 1676 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1677 __k |= _mm_srli_epi16(__k, 8); 1678 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1684 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1685 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1686 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1688 using _V = typename _TVT::type; 1689 using _Up = typename _TVT::value_type; 1691 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1692 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1693 if (__builtin_is_constant_evaluated()) 1695 #if __cplusplus > 201703 1696 // after C++17, signed shifts have no UB, and behave just like unsigned 1698 else if constexpr (is_signed_v<_Up>) 1699 return __vector_bitcast<_Up>( 1700 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1701 __vector_bitcast<make_unsigned_t<_Up>>(__y))); 1703 else if constexpr (sizeof(_Up) == 1) 1705 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1706 return __vector_bitcast<_Up>(__concat( 1707 _mm512_cvtepi16_epi8( 1708 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1709 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1710 _mm512_cvtepi16_epi8( 1711 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1712 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1713 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1714 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1715 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1716 _mm512_cvtepu8_epi16(__iy)))); 1717 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1718 return __intrin_bitcast<_V>( 1719 
_mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1720 _mm_cvtepu8_epi16(__iy)))); 1721 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1722 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1723 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1724 _mm256_cvtepu8_epi16(__iy)))); 1725 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1726 return __intrin_bitcast<_V>( 1727 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1728 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1729 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1730 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1733 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5); 1735 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1737 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1738 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1741 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1743 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1744 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1746 auto __x1 = __x + __x; 1747 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1748 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1750 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1752 else if constexpr (sizeof(__x) == 16) 1755 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5); 1757 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4); 1759 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1762 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2); 1764 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1766 auto __x1 = __x + __x; 1767 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1769 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1774 else if constexpr (sizeof(_Up) == 2) 1776 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1777 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1778 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1779 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1780 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1781 return __vector_bitcast<_Up>( 1782 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1783 _mm512_castsi256_si512(__iy)))); 1784 else if constexpr (sizeof __ix == 32 && __have_avx2) 1786 const auto __ux = __vector_bitcast<unsigned>(__x); 1787 const auto __uy = __vector_bitcast<unsigned>(__y); 1788 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1789 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1790 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1792 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1793 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1794 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1795 return __intrin_bitcast<_V>( 1796 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1797 _mm512_castsi128_si512(__iy)))); 1798 else if constexpr (sizeof __ix == 16 && __have_avx2) 1800 const auto __ux = __vector_bitcast<unsigned>(__ix); 1801 const auto __uy = __vector_bitcast<unsigned>(__iy); 1802 return __intrin_bitcast<_V>(_mm_blend_epi16( 1803 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1804 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1806 else if constexpr (sizeof __ix == 16) 1808 using _Float4 = __vector_type_t<float, 4>; 1809 using _Int4 = __vector_type_t<int, 4>; 1810 using _UInt4 = __vector_type_t<unsigned, 4>; 1812 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 
3))); 1814 * __intrin_bitcast<_V>( 1815 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1816 reinterpret_cast<_Float4>(__yu << 23))) 1817 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>( 1818 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1822 __assert_unreachable<_Tp>(); 1824 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1826 // latency is suboptimal, but throughput is at full speedup 1827 return __intrin_bitcast<_V>( 1828 __vector_bitcast<unsigned>(__ix) 1829 * __vector_convert<__vector_type16_t<int>>( 1830 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1831 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000)))); 1832 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1835 const auto __lo = _mm_sll_epi64(__ix, __iy); 1837 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1838 if constexpr (__have_sse4_1) 1839 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1841 return __vector_bitcast<_Up>( 1842 _mm_move_sd(__vector_bitcast<double>(__hi), 1843 __vector_bitcast<double>(__lo))); 1848 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1851 // _S_bit_shift_right {{{ 1852 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1853 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1854 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1855 _S_bit_shift_right(_Tp __xx, int __y) 1857 using _V = typename _TVT::type; 1858 using _Up = typename _TVT::value_type; 1860 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1861 if (__builtin_is_constant_evaluated()) 1863 else if (__builtin_constant_p(__y) 1865 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1867 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1868 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1871 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1872 return __intrin_bitcast<_V>( 1873 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix) 1876 | (__vector_bitcast<_UShort>( 1877 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8) 1881 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1882 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1885 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32) 1886 & _Up(0xffff'ffff'0000'0000ull)) 1887 | __vector_bitcast<_Up>( 1888 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix) 1892 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1894 | __vector_bitcast<_Up>( 1895 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll) 1903 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1904 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1905 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1907 using _V = typename _TVT::type; 1908 using _Up = typename _TVT::value_type; 1910 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1911 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1912 if (__builtin_is_constant_evaluated() 1913 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1915 else if constexpr (sizeof(_Up) == 1) //{{{ 1917 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1918 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1919 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1920 _mm_cvtepi8_epi16(__iy)) 1921 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1922 _mm_cvtepu8_epi16(__iy)))); 1923 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1924 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1926 ? 
_mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1927 _mm256_cvtepi8_epi16(__iy)) 1928 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1929 _mm256_cvtepu8_epi16(__iy)))); 1930 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1931 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1933 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1934 _mm512_cvtepi8_epi16(__iy)) 1935 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1936 _mm512_cvtepu8_epi16(__iy)))); 1937 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1938 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1939 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1940 0x5555'5555'5555'5555ull, 1942 _mm512_slli_epi16(__ix, 8), 1943 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1944 _mm512_set1_epi16(8))))); 1945 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1946 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1947 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1948 0x5555'5555'5555'5555ull, 1950 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1951 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1952 /* This has better throughput but higher latency than the impl below 1953 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1956 const auto __shorts = __to_intrin(_S_bit_shift_right( 1957 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1958 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1959 return __vector_bitcast<_Up>( 1960 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1963 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1964 // the following uses vpsr[al]vd, which requires AVX2 1965 if constexpr (is_signed_v<_Up>) 1967 const auto r3 = __vector_bitcast<_UInt>( 1968 (__vector_bitcast<int>(__x) 1969 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1972 = __vector_bitcast<_UInt>( 1973 ((__vector_bitcast<int>(__x) << 8) 1974 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1977 = __vector_bitcast<_UInt>( 1978 ((__vector_bitcast<int>(__x) << 16) 1979 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1981 const auto r0 = __vector_bitcast<_UInt>( 1982 (__vector_bitcast<int>(__x) << 24) 1983 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1984 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1989 const auto r3 = (__vector_bitcast<_UInt>(__x) 1990 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1993 = ((__vector_bitcast<_UInt>(__x) << 8) 1994 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1997 = ((__vector_bitcast<_UInt>(__x) << 16) 1998 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 2001 = (__vector_bitcast<_UInt>(__x) << 24) 2002 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 2003 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 2006 else if constexpr (__have_sse4_1 2007 && is_unsigned_v<_Up> && sizeof(__x) > 2) 2009 auto __x128 = __vector_bitcast<_Up>(__ix); 2011 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 2012 auto __x4 = __vector_bitcast<_Up>( 2013 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 2014 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2015 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 2017 auto __x2 = __vector_bitcast<_Up>( 2018 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 2019 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2020 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 2022 auto __x1 = __vector_bitcast<_Up>( 2023 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 2024 __x128 = 
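    // The AVX2 branch above shifts every byte by its own count using only
    // 32-bit shifts: byte k of each dword is moved into bits 24..31
    // (<< (24 - 8k)), the whole lane is shifted right by that byte's count,
    // extracted as ((__y << (24 - 8k)) >> 24), with sign extension for signed
    // _Up, and only the resulting top byte is kept and OR-ed back into
    // position k.  Per-byte reference semantics (illustration only, signed
    // case shown, counts < 8 assumed):
    //   for (int __k = 0; __k < 4; ++__k)
    //     {
    //       int8_t  __b = int8_t (__x >> (8 * __k));            // byte k
    //       uint8_t __c = uint8_t(__y >> (8 * __k));             // its count
    //       __r |= uint32_t(uint8_t(__b >> __c)) << (8 * __k);
    //     }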
__vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2025 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 2026 return __intrin_bitcast<_V>( 2028 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2029 == 0)); // y > 7 nulls the result 2031 else if constexpr (__have_sse4_1 2032 && is_signed_v<_Up> && sizeof(__x) > 2) 2034 auto __mask = __vector_bitcast<_UChar>( 2035 __vector_bitcast<_UShort>(__iy) << 5); 2036 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2037 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2039 auto __xh = __vector_bitcast<short>(__ix); 2040 auto __xl = __vector_bitcast<short>(__ix) << 8; 2041 auto __xh4 = __xh >> 4; 2042 auto __xl4 = __xl >> 4; 2043 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2044 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2045 __xl = __vector_bitcast<short>( 2046 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2047 __to_intrin(__xl4))); 2049 auto __xh2 = __xh >> 2; 2050 auto __xl2 = __xl >> 2; 2051 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2052 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2053 __xl = __vector_bitcast<short>( 2054 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2055 __to_intrin(__xl2))); 2057 auto __xh1 = __xh >> 1; 2058 auto __xl1 = __xl >> 1; 2059 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2060 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2061 __xl = __vector_bitcast<short>( 2062 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2063 __to_intrin(__xl1))); 2064 return __intrin_bitcast<_V>( 2065 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2066 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2068 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2069 == 0)); // y > 7 nulls the result 2071 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2074 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2075 auto __x4 = __vector_bitcast<_Up>( 2076 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2077 __x = __mask > 0x7f ? __x4 : __x; 2079 auto __x2 = __vector_bitcast<_Up>( 2080 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2081 __x = __mask > 0x7f ? __x2 : __x; 2083 auto __x1 = __vector_bitcast<_Up>( 2084 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2085 __x = __mask > 0x7f ? __x1 : __x; 2087 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2089 else if constexpr (sizeof(__x) > 2) // signed SSE2 2091 static_assert(is_signed_v<_Up>); 2092 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2093 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2094 auto __xh = __vector_bitcast<short>(__x); 2095 auto __xl = __vector_bitcast<short>(__x) << 8; 2096 auto __xh4 = __xh >> 4; 2097 auto __xl4 = __xl >> 4; 2098 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2099 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2102 auto __xh2 = __xh >> 2; 2103 auto __xl2 = __xl >> 2; 2104 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2105 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2108 auto __xh1 = __xh >> 1; 2109 auto __xl1 = __xl >> 1; 2110 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2111 __xl = __maskl > 0x7fff ? 
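    // Signed bytes are processed in 16-bit lanes so the arithmetic shift can
    // pull sign bits in from above the byte boundary: __xh keeps the high
    // byte of each pair in place (its low byte becomes garbage that is masked
    // off later), while __xl first moves the low byte up by 8.  The counts
    // are again consumed as a 4/2/1 ladder; with SSE4.1 the selection uses
    // the byte blend (pblendvb), with the controlling count bit shifted into
    // the MSB of the byte the blend inspects, and the plain-SSE2 variant
    // replaces the blends with compares against 0x7fff, i.e. a test of
    // bit 15.  Recombination per 16-bit pair (illustration only):
    //   __r = (__xh & 0xff00) | ((unsigned short)__xl >> 8);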
__xl1 : __xl; 2112 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2113 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2116 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2121 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2123 [[maybe_unused]] auto __blend_0xaa 2124 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2125 if constexpr (sizeof(__a) == 16) 2126 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2128 else if constexpr (sizeof(__a) == 32) 2129 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2131 else if constexpr (sizeof(__a) == 64) 2132 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2135 __assert_unreachable<decltype(__a)>(); 2137 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2138 return __intrin_bitcast<_V>(is_signed_v<_Up> 2139 ? _mm_srav_epi16(__ix, __iy) 2140 : _mm_srlv_epi16(__ix, __iy)); 2141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2142 return __vector_bitcast<_Up>(is_signed_v<_Up> 2143 ? _mm256_srav_epi16(__ix, __iy) 2144 : _mm256_srlv_epi16(__ix, __iy)); 2145 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2146 return __vector_bitcast<_Up>(is_signed_v<_Up> 2147 ? _mm512_srav_epi16(__ix, __iy) 2148 : _mm512_srlv_epi16(__ix, __iy)); 2149 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2150 return __intrin_bitcast<_V>( 2151 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16) 2152 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2154 __vector_bitcast<int>(__ix) 2155 >> (__vector_bitcast<int>(__iy) >> 16))); 2156 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2157 return __intrin_bitcast<_V>( 2158 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2159 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2160 __vector_bitcast<_UInt>(__ix) 2161 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2162 else if constexpr (__have_sse4_1) 2164 auto __mask = __vector_bitcast<_UShort>(__iy); 2165 auto __x128 = __vector_bitcast<_Up>(__ix); 2167 __mask = (__mask << 3) | (__mask << 11); 2168 // do __x128 = 0 where __y[4] is set 2169 __x128 = __vector_bitcast<_Up>( 2170 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2171 __to_intrin(__mask))); 2172 // do __x128 =>> 8 where __y[3] is set 2173 __x128 = __vector_bitcast<_Up>( 2174 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2175 __to_intrin(__mask += __mask))); 2176 // do __x128 =>> 4 where __y[2] is set 2177 __x128 = __vector_bitcast<_Up>( 2178 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2179 __to_intrin(__mask += __mask))); 2180 // do __x128 =>> 2 where __y[1] is set 2181 __x128 = __vector_bitcast<_Up>( 2182 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2183 __to_intrin(__mask += __mask))); 2184 // do __x128 =>> 1 where __y[0] is set 2185 return __intrin_bitcast<_V>( 2186 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2187 __to_intrin(__mask + __mask))); 2191 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2192 auto __x128 = __vector_bitcast<_Up>(__ix); 2194 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2195 return __vector_bitcast<short>(__kk) < 0; 2197 // do __x128 = 0 where __y[4] is set 2198 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2199 // do __x128 =>> 8 where __y[3] is set 2200 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2201 // do __x128 =>> 4 where __y[2] is set 2202 __x128 = __mask(__k += __k) ? 
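    // The 16-bit fallback decomposes each count into its binary digits and
    // consumes them from bit 4 down to bit 0, doubling the mask after every
    // step so the next bit moves into the sign position that blendv (or the
    // `< 0` compare) tests; the first step zeroes lanes whose count is >= 16.
    // Scalar equivalent per element (illustration only):
    //   unsigned short __r = __x;
    //   if (__y & 16) __r = 0;
    //   if (__y & 8)  __r >>= 8;
    //   if (__y & 4)  __r >>= 4;
    //   if (__y & 2)  __r >>= 2;
    //   if (__y & 1)  __r >>= 1;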
__x128 >> 4 : __x128; 2203 // do __x128 =>> 2 where __y[1] is set 2204 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2205 // do __x128 =>> 1 where __y[0] is set 2206 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2210 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2212 if constexpr (is_unsigned_v<_Up>) 2214 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2215 const __m128 __factor_f = reinterpret_cast<__m128>( 2216 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23)); 2217 const __m128i __factor 2218 = __builtin_constant_p(__factor_f) 2220 __make_vector<unsigned>(__factor_f[0], __factor_f[1], 2221 __factor_f[2], __factor_f[3])) 2222 : _mm_cvttps_epi32(__factor_f); 2224 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2225 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2226 _mm_srli_si128(__factor, 4)); 2227 if constexpr (__have_sse4_1) 2228 return __intrin_bitcast<_V>( 2229 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2231 return __intrin_bitcast<_V>( 2232 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2236 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2237 if constexpr (is_signed_v<_Up>) 2238 return _mm_sra_epi32(__a, __b); 2240 return _mm_srl_epi32(__a, __b); 2243 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2244 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2246 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2247 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2248 if constexpr (__have_sse4_1) 2249 return __intrin_bitcast<_V>( 2250 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2251 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2253 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2254 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2255 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2261 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2266 template <typename _Tp, size_t _Np> 2267 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2268 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2270 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2272 if (__builtin_is_constant_evaluated() 2273 || (__x._M_is_constprop() && __y._M_is_constprop())) 2274 return _MaskImpl::_S_to_bits( 2275 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2277 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2278 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2279 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2280 if constexpr (is_floating_point_v<_Tp>) 2282 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2283 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2284 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2285 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2286 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2287 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2288 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2289 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2290 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2291 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2292 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2293 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2295 __assert_unreachable<_Tp>(); 2297 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2298 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2299 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 
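    // In _S_bit_shift_right above, the 32-bit unsigned branch for targets
    // without AVX2 uses the identity  x >> y == (x * 2^(31 - y)) >> 31: the
    // factor 2^(31 - y) is produced by subtracting y from the exponent field
    // of 2^31 (bit pattern 0x4f00'0000 - (y << 23)), converting that float to
    // an integer, and letting pmuludq form the 64-bit products.  Scalar
    // sketch of the identity (illustration only, 0 <= y <= 31):
    //   uint32_t __bits = 0x4f000000u - (__y << 23);       // float 2^(31 - y)
    //   float __f;  __builtin_memcpy(&__f, &__bits, 4);
    //   uint64_t __factor = (uint64_t) __f;                // exact power of two
    //   uint32_t __r = uint32_t((uint64_t(__x) * __factor) >> 31);  // == __x >> __y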
2300 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2301 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2302 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2304 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2306 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2308 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2310 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2311 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2312 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2314 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2316 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2317 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2318 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2319 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2320 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2322 __assert_unreachable<_Tp>(); 2324 else if (__builtin_is_constant_evaluated()) 2325 return _Base::_S_equal_to(__x, __y); 2326 else if constexpr (sizeof(__x) == 8) 2328 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2329 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2330 _MaskMember<_Tp> __r64{}; 2331 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2335 return _Base::_S_equal_to(__x, __y); 2339 // _S_not_equal_to {{{ 2340 template <typename _Tp, size_t _Np> 2341 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2342 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2344 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2346 if (__builtin_is_constant_evaluated() 2347 || (__x._M_is_constprop() && __y._M_is_constprop())) 2348 return _MaskImpl::_S_to_bits( 2349 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2351 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2352 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2353 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2354 if constexpr (is_floating_point_v<_Tp>) 2356 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2357 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2358 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2359 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2360 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2361 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2362 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2363 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2364 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2365 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2366 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2367 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2369 __assert_unreachable<_Tp>(); 2371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2372 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2373 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2374 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2375 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2376 return 
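    // With an AVX-512 ABI the comparison results are kept as bitmasks
    // (__mmaskN) rather than vector masks: the implicit mask __k1 clears the
    // bits of lanes beyond the simd's size, and operator!= is formed by
    // complementing the masked cmpeq result; any stray bits the complement
    // sets belong to inactive lanes and are masked out again wherever the
    // bits are consumed.  The constant-evaluation / constant-propagation
    // checks at the top fall back to the generic vector comparison because
    // the intrinsics are opaque both to the constexpr evaluator and to GCC's
    // constant folding.  E.g. (illustration only):
    //   __mmask16 __eq = _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
    //   __mmask16 __ne = ~__eq;  // within active lanes: bit i <=> x[i] != y[i]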
~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2377 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2378 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2380 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2381 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2382 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2383 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2384 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2385 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2386 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2387 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2388 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2389 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2390 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2391 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2392 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2393 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2394 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2396 __assert_unreachable<_Tp>(); 2398 else if (__builtin_is_constant_evaluated()) 2399 return _Base::_S_not_equal_to(__x, __y); 2400 else if constexpr (sizeof(__x) == 8) 2402 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2403 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2404 _MaskMember<_Tp> __r64{}; 2405 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2409 return _Base::_S_not_equal_to(__x, __y); 2414 template <typename _Tp, size_t _Np> 2415 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2416 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2418 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2420 if (__builtin_is_constant_evaluated() 2421 || (__x._M_is_constprop() && __y._M_is_constprop())) 2422 return _MaskImpl::_S_to_bits( 2423 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2425 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2426 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2427 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2428 if constexpr (sizeof(__xi) == 64) 2430 if constexpr (is_same_v<_Tp, float>) 2431 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2432 else if constexpr (is_same_v<_Tp, double>) 2433 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2435 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2436 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2437 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2438 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2439 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2440 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2441 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2443 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2444 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2445 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2446 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2447 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2448 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2449 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2451 __assert_unreachable<_Tp>(); 2453 else if constexpr (sizeof(__xi) == 32) 2455 if constexpr (is_same_v<_Tp, 
float>) 2456 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2457 else if constexpr (is_same_v<_Tp, double>) 2458 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2459 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2460 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2461 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2462 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2463 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2464 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2465 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2466 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2467 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2468 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2469 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2470 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2471 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2472 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2473 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2474 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2476 __assert_unreachable<_Tp>(); 2478 else if constexpr (sizeof(__xi) == 16) 2480 if constexpr (is_same_v<_Tp, float>) 2481 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2482 else if constexpr (is_same_v<_Tp, double>) 2483 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2484 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2485 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2486 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2487 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2488 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2489 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2490 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2491 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2492 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2493 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2494 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2495 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2496 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2497 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2498 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2499 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2501 __assert_unreachable<_Tp>(); 2504 __assert_unreachable<_Tp>(); 2506 else if (__builtin_is_constant_evaluated()) 2507 return _Base::_S_less(__x, __y); 2508 else if constexpr (sizeof(__x) == 8) 2510 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2511 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2512 _MaskMember<_Tp> __r64{}; 2513 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2517 return _Base::_S_less(__x, __y); 2521 // _S_less_equal {{{ 2522 template <typename _Tp, size_t _Np> 2523 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2524 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2526 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2528 if (__builtin_is_constant_evaluated() 2529 || (__x._M_is_constprop() && __y._M_is_constprop())) 2530 return _MaskImpl::_S_to_bits( 2531 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2533 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2534 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2535 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2536 if constexpr (sizeof(__xi) == 64) 
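    // For 8-byte operands (e.g. fixed-size 2 x float or 8 x char) the
    // comparisons above reinterpret both sides as full 16-byte vectors, run a
    // single SSE compare, and memcpy only the low 8 bytes of the 16-byte
    // result into the mask member; the unspecified upper half never escapes
    // the temporary.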
2538 if constexpr (is_same_v<_Tp, float>) 2539 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2540 else if constexpr (is_same_v<_Tp, double>) 2541 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2543 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2544 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2545 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2546 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2547 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2548 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2549 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2551 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2552 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2553 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2554 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2555 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2556 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2557 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2559 __assert_unreachable<_Tp>(); 2561 else if constexpr (sizeof(__xi) == 32) 2563 if constexpr (is_same_v<_Tp, float>) 2564 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2565 else if constexpr (is_same_v<_Tp, double>) 2566 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2567 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2568 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2569 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2570 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2571 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2572 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2573 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2574 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2575 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2576 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2577 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2578 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2579 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2580 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2581 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2582 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2584 __assert_unreachable<_Tp>(); 2586 else if constexpr (sizeof(__xi) == 16) 2588 if constexpr (is_same_v<_Tp, float>) 2589 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2590 else if constexpr (is_same_v<_Tp, double>) 2591 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2592 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2593 return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2594 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2595 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2596 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2597 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2598 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2599 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2600 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2601 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2602 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2603 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2604 else if constexpr (is_unsigned_v<_Tp> && 
sizeof(_Tp) == 4) 2605 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2606 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2607 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2609 __assert_unreachable<_Tp>(); 2612 __assert_unreachable<_Tp>(); 2614 else if (__builtin_is_constant_evaluated()) 2615 return _Base::_S_less_equal(__x, __y); 2616 else if constexpr (sizeof(__x) == 8) 2618 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2619 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2620 _MaskMember<_Tp> __r64{}; 2621 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2625 return _Base::_S_less_equal(__x, __y); 2630 template <typename _Tp, size_t _Np> 2631 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2632 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2634 if constexpr (__is_avx512_abi<_Abi>()) 2635 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2637 return _Base::_S_negate(__x); 2642 using _Base::_S_abs; 2645 template <typename _Tp, size_t _Np> 2646 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2647 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2649 if constexpr (__is_sse_ps<_Tp, _Np>()) 2650 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2651 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2652 return _mm_sqrt_pd(__x); 2653 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2654 return _mm256_sqrt_ps(__x); 2655 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2656 return _mm256_sqrt_pd(__x); 2657 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2658 return _mm512_sqrt_ps(__x); 2659 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2660 return _mm512_sqrt_pd(__x); 2662 __assert_unreachable<_Tp>(); 2667 template <typename _Tp, size_t _Np> 2668 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2669 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2670 __fixed_size_storage_t<int, _Np> __exp) 2672 if constexpr (sizeof(__x) == 64 || __have_avx512vl) 2674 const auto __xi = __to_intrin(__x); 2675 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi> 2677 const auto __expi = __to_intrin(__cvt(__exp)); 2678 using _Up = __bool_storage_member_type_t<_Np>; 2679 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? 
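    // __k1 is an all-active bitmask for the _Np used lanes; the branch on
    // _Np < sizeof(_Up) * CHAR_BIT avoids the undefined shift 1ULL << 64 when
    // every bit of the mask type is needed.  The vscalefps/vscalefpd
    // intrinsics below compute x * 2^floor(e) per lane, which is exactly
    // x * 2^e here because the exponents are converted from int, and the
    // maskz form zeroes the lanes beyond _Np.  E.g. (illustration only):
    //   __m128 __r = _mm_maskz_scalef_ps(0x7, __xv, __ev);  // lane 3 -> 0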
_Up((1ULL << _Np) - 1) : ~_Up(); 2680 if constexpr (sizeof(__xi) == 16) 2682 if constexpr (sizeof(_Tp) == 8) 2683 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2685 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2687 else if constexpr (sizeof(__xi) == 32) 2689 if constexpr (sizeof(_Tp) == 8) 2690 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2692 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2696 static_assert(sizeof(__xi) == 64); 2697 if constexpr (sizeof(_Tp) == 8) 2698 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2700 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2704 return _Base::_S_ldexp(__x, __exp); 2709 template <typename _Tp, size_t _Np> 2710 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2711 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2713 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2714 return _mm512_roundscale_ps(__x, 0x0b); 2715 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2716 return _mm512_roundscale_pd(__x, 0x0b); 2717 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2718 return _mm256_round_ps(__x, 0xb); 2719 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2720 return _mm256_round_pd(__x, 0xb); 2721 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2722 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb)); 2723 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2724 return _mm_round_pd(__x, 0xb); 2725 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2728 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2729 const auto __no_fractional_values 2730 = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) 2732 < 0x4b000000; // the exponent is so large that no mantissa bits 2733 // signify fractional values (0x3f8 + 23*8 = 2735 return __no_fractional_values ? __truncated : __to_intrin(__x); 2738 return _Base::_S_trunc(__x); 2743 template <typename _Tp, size_t _Np> 2744 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2745 _S_round(_SimdWrapper<_Tp, _Np> __x) 2747 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2748 // from zero as required by std::round. Therefore this function is more 2750 using _V = __vector_type_t<_Tp, _Np>; 2752 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2753 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2754 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2755 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2756 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2757 __truncated = _mm256_round_ps(__x._M_data, 2758 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2759 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2760 __truncated = _mm256_round_pd(__x._M_data, 2761 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2762 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2763 __truncated = __auto_bitcast( 2764 _mm_round_ps(__to_intrin(__x), 2765 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2766 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2768 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2769 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2770 __truncated = __auto_bitcast( 2771 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2773 return _Base::_S_round(__x); 2775 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2776 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2780 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2781 ? 
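    // std::round rounds halfway cases away from zero, so the truncated value
    // is bumped by copysign(1, x) whenever the discarded fraction is >= .5;
    // _S_signmask/_S_absmask pick out the sign and magnitude bits.  Scalar
    // equivalent (illustration only):
    //   _Tp __t = std::trunc(__x);
    //   _Tp __r = std::fabs(__x - __t) >= _Tp(.5)
    //               ? __t + std::copysign(_Tp(1), __x) : __t;
    // The pre-SSE4.1 path additionally guards with |x| < 0x1p23f: values of
    // that magnitude are already integral (and the largest of them do not
    // survive the cvttps -> int32 round trip used for truncation), so __x is
    // returned unchanged there.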
__or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2783 if constexpr (__have_sse4_1) 2785 else // adjust for missing range in cvttps_epi32 2786 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2792 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2793 _GLIBCXX_SIMD_INTRINSIC static _Tp 2794 _S_nearbyint(_Tp __x) noexcept 2796 if constexpr (_TVT::template _S_is<float, 16>) 2797 return _mm512_roundscale_ps(__x, 0x0c); 2798 else if constexpr (_TVT::template _S_is<double, 8>) 2799 return _mm512_roundscale_pd(__x, 0x0c); 2800 else if constexpr (_TVT::template _S_is<float, 8>) 2801 return _mm256_round_ps(__x, 2802 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2803 else if constexpr (_TVT::template _S_is<double, 4>) 2804 return _mm256_round_pd(__x, 2805 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2806 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2807 return _mm_round_ps(__x, 2808 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2809 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2810 return _mm_round_pd(__x, 2811 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2813 return _Base::_S_nearbyint(__x); 2818 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2819 _GLIBCXX_SIMD_INTRINSIC static _Tp 2820 _S_rint(_Tp __x) noexcept 2822 if constexpr (_TVT::template _S_is<float, 16>) 2823 return _mm512_roundscale_ps(__x, 0x04); 2824 else if constexpr (_TVT::template _S_is<double, 8>) 2825 return _mm512_roundscale_pd(__x, 0x04); 2826 else if constexpr (_TVT::template _S_is<float, 8>) 2827 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2828 else if constexpr (_TVT::template _S_is<double, 4>) 2829 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2830 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2831 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2832 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2833 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2835 return _Base::_S_rint(__x); 2840 template <typename _Tp, size_t _Np> 2841 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2842 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2844 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2845 return _mm512_roundscale_ps(__x, 0x09); 2846 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2847 return _mm512_roundscale_pd(__x, 0x09); 2848 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2849 return _mm256_round_ps(__x, 0x9); 2850 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2851 return _mm256_round_pd(__x, 0x9); 2852 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2853 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9)); 2854 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2855 return _mm_round_pd(__x, 0x9); 2857 return _Base::_S_floor(__x); 2862 template <typename _Tp, size_t _Np> 2863 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2864 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2866 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2867 return _mm512_roundscale_ps(__x, 0x0a); 2868 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2869 return _mm512_roundscale_pd(__x, 0x0a); 2870 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2871 return _mm256_round_ps(__x, 0xa); 2872 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2873 return _mm256_round_pd(__x, 0xa); 2874 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2875 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa)); 2876 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2877 return _mm_round_pd(__x, 
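    // Rounding immediates used above: for roundps/roundpd, bits 1:0 select
    // the mode (0 nearest-even, 1 down, 2 up, 3 toward zero), bit 2 means
    // "use the current MXCSR mode", and bit 3 suppresses the inexact
    // exception; hence 0x9 = floor, 0xa = ceil, 0xb = trunc, and
    // _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC for nearbyint versus
    // _MM_FROUND_CUR_DIRECTION alone for rint, which must raise FE_INEXACT.
    // The AVX-512 roundscale immediates 0x09/0x0a/0x0b/0x0c/0x04 encode the
    // same controls in their low four bits (the scale in the high four bits
    // is 0).  E.g.:
    //   static_assert((_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) == 0xb);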
0xa); 2879 return _Base::_S_ceil(__x); 2884 template <typename _Tp, size_t _Np> 2885 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2886 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2888 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2890 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2891 return _mm512_movepi32_mask( 2892 __intrin_bitcast<__m512i>(__x._M_data)); 2893 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2894 return _mm512_movepi64_mask( 2895 __intrin_bitcast<__m512i>(__x._M_data)); 2896 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2897 return _mm256_movepi32_mask( 2898 __intrin_bitcast<__m256i>(__x._M_data)); 2899 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2900 return _mm256_movepi64_mask( 2901 __intrin_bitcast<__m256i>(__x._M_data)); 2902 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2903 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2904 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2905 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2907 else if constexpr (__is_avx512_abi<_Abi>()) 2909 const auto __xi = __to_intrin(__x); 2910 [[maybe_unused]] constexpr auto __k1 2911 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2912 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2913 return _mm_movemask_ps(__xi); 2914 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2915 return _mm_movemask_pd(__xi); 2916 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2917 return _mm256_movemask_ps(__xi); 2918 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2919 return _mm256_movemask_pd(__xi); 2920 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2921 return _mm512_mask_cmplt_epi32_mask( 2922 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2923 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2924 return _mm512_mask_cmplt_epi64_mask( 2925 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2927 __assert_unreachable<_Tp>(); 2930 return _Base::_S_signbit(__x); 2932 using _I = __int_for_sizeof_t<_Tp>; 2933 if constexpr (sizeof(__x) == 64) 2934 return _S_less(__vector_bitcast<_I>(__x), _I()); 2937 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2938 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2939 if constexpr ((sizeof(_Tp) == 4 && 2940 (__have_avx2 || sizeof(__x) == 16)) || 2943 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2945 else if constexpr ((__have_avx2 || 2946 (__have_ssse3 && sizeof(__x) == 16))) 2948 return __vector_bitcast<_Tp>((__xx & __signmask) == 2952 { // SSE2/3 or AVX (w/o AVX2) 2953 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2954 return __vector_bitcast<_Tp>( 2955 __vector_bitcast<_Tp>( 2956 (__xx & __signmask) | 2957 __vector_bitcast<_I>(__one)) // -1 or 1 2965 // _S_isnonzerovalue_mask {{{ 2966 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2967 template <typename _Tp> 2968 _GLIBCXX_SIMD_INTRINSIC static auto 2969 _S_isnonzerovalue_mask(_Tp __x) 2971 using _Traits = _VectorTraits<_Tp>; 2972 if constexpr (__have_avx512dq_vl) 2974 if constexpr (_Traits::template _S_is< 2975 float, 2> || _Traits::template _S_is<float, 4>) 2976 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2977 else if constexpr (_Traits::template _S_is<float, 8>) 2978 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2979 else if constexpr (_Traits::template _S_is<float, 16>) 2980 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2981 else if constexpr 
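    // _S_signbit above only inspects the MSB of each lane: with AVX-512DQ,
    // vpmovd2m/vpmovq2m copy exactly those bits into a mask register;
    // without it, movmskps/movmskpd or a signed compare of the integer view
    // against zero do the same.  This matches std::signbit, which reports
    // the sign bit even for NaNs and for -0.0, i.e. per lane
    //   signbit(__x)  <=>  __int_for_sizeof_t<_Tp>(bit pattern of __x) < 0.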
(_Traits::template _S_is<double, 2>) 2982 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2983 else if constexpr (_Traits::template _S_is<double, 4>) 2984 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2985 else if constexpr (_Traits::template _S_is<double, 8>) 2986 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2988 __assert_unreachable<_Tp>(); 2992 using _Up = typename _Traits::value_type; 2993 constexpr size_t _Np = _Traits::_S_full_size; 2994 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2995 const auto __b = __x * _Up(); // NaN if __x == inf 2996 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2997 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2999 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 3001 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 3002 __auto_bitcast(__b), 3004 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 3005 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3006 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 3008 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3009 __auto_bitcast(__b), 3011 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 3012 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3013 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 3014 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 3015 __auto_bitcast(__b), 3017 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 3018 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3019 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 3021 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3022 __auto_bitcast(__b), 3024 else if constexpr (__is_avx512_ps<_Up, _Np>()) 3025 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3026 else if constexpr (__is_avx512_pd<_Up, _Np>()) 3027 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3029 __assert_unreachable<_Tp>(); 3035 template <typename _Tp, size_t _Np> 3036 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3037 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3039 static_assert(is_floating_point_v<_Tp>); 3040 #if !__FINITE_MATH_ONLY__ 3041 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3043 const auto __xi = __to_intrin(__x); 3044 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3045 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3046 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3047 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3048 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3049 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3050 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3051 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3052 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3053 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3054 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3055 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3056 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3058 else if constexpr (__is_avx512_abi<_Abi>()) 3060 // if all exponent bits are set, __x is either inf or NaN 3061 using _I = __int_for_sizeof_t<_Tp>; 3062 const auto __inf = __vector_bitcast<_I>( 3063 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3064 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3068 return _Base::_S_isfinite(__x); 3073 template <typename _Tp, size_t _Np> 3074 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3075 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 
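    // Without AVX-512DQ, _S_isnonzerovalue_mask above classifies "finite and
    // nonzero" with two multiplications and one ordered compare:
    //   __a = __x * inf   is NaN exactly when __x == 0         (0 * inf = NaN)
    //   __b = __x * 0     is NaN exactly when __x is inf or NaN
    //   cmp_ord(__a, __b) is true iff neither is NaN, i.e. iff __x is a
    //   nonzero finite value (normal or subnormal).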
3077 #if !__FINITE_MATH_ONLY__ 3078 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3080 const auto __xi = __to_intrin(__x); 3081 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3082 return _mm512_fpclass_ps_mask(__xi, 0x18); 3083 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3084 return _mm512_fpclass_pd_mask(__xi, 0x18); 3085 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3086 return _mm256_fpclass_ps_mask(__xi, 0x18); 3087 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3088 return _mm256_fpclass_pd_mask(__xi, 0x18); 3089 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3090 return _mm_fpclass_ps_mask(__xi, 0x18); 3091 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3092 return _mm_fpclass_pd_mask(__xi, 0x18); 3094 __assert_unreachable<_Tp>(); 3096 else if constexpr (__have_avx512dq_vl) 3098 if constexpr (__is_sse_pd<_Tp, _Np>()) 3099 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3100 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3101 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3102 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3103 return _mm_movm_epi32( 3104 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3105 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3106 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3108 __assert_unreachable<_Tp>(); 3112 return _Base::_S_isinf(__x); 3117 template <typename _Tp, size_t _Np> 3118 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3119 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3121 #if __FINITE_MATH_ONLY__ 3122 [[maybe_unused]] constexpr int __mode = 0x26; 3124 [[maybe_unused]] constexpr int __mode = 0xbf; 3126 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3128 const auto __xi = __to_intrin(__x); 3129 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3130 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3131 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3132 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3133 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3134 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3135 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3136 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3137 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3138 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3139 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3140 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3141 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3143 __assert_unreachable<_Tp>(); 3145 else if constexpr (__have_avx512dq) 3147 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3148 return _mm_movm_epi32( 3149 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3150 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3151 return _mm256_movm_epi32( 3152 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3153 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3154 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3155 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3156 return _mm_movm_epi64( 3157 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3158 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3159 return _mm256_movm_epi64( 3160 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3161 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3162 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3164 
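    // The vfpclass immediates used here select category bits
    // 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 subnormal,
    // 0x40 finite negative, 0x80 SNaN.  Hence 0x18 = isinf, 0x99 = NaN or
    // infinity (so __k1 ^ ... = isfinite), 0x9f additionally includes +-0
    // (its complement is "nonzero finite"), and isnormal uses 0xbf
    // (everything except "finite negative"), or 0x26 = zero-or-subnormal
    // under -ffinite-math-only, where infinities and NaNs cannot occur.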
__assert_unreachable<_Tp>(); 3166 else if constexpr (__is_avx512_abi<_Abi>()) 3168 using _I = __int_for_sizeof_t<_Tp>; 3169 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3170 const auto minn = __vector_bitcast<_I>( 3171 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3172 #if __FINITE_MATH_ONLY__ 3173 return _S_less_equal<_I, _Np>(minn, absn); 3176 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3177 return __and(_S_less_equal<_I, _Np>(minn, absn), 3178 _S_less<_I, _Np>(absn, infn)); 3182 return _Base::_S_isnormal(__x); 3187 template <typename _Tp, size_t _Np> 3188 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3189 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3190 { return _S_isunordered(__x, __x); } 3193 // _S_isunordered {{{ 3194 template <typename _Tp, size_t _Np> 3195 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3196 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3197 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3199 #if __FINITE_MATH_ONLY__ 3202 const auto __xi = __to_intrin(__x); 3203 const auto __yi = __to_intrin(__y); 3204 if constexpr (__is_avx512_abi<_Abi>()) 3206 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3207 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3208 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3209 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3210 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3211 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3212 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3213 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3214 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3215 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3216 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3217 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3218 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3220 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3221 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3222 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3223 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3224 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3225 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3226 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3227 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3229 __assert_unreachable<_Tp>(); 3235 template <typename _Tp, size_t _Np> 3236 static constexpr _MaskMember<_Tp> 3237 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3239 const auto __xi = __to_intrin(__x); 3240 const auto __yi = __to_intrin(__y); 3241 if constexpr (__is_avx512_abi<_Abi>()) 3243 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3244 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3245 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3246 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3247 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3248 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3249 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3250 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3251 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3252 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3253 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3254 else if constexpr (sizeof(__xi) 
== 16 && sizeof(_Tp) == 8) 3255 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3257 __assert_unreachable<_Tp>(); 3259 else if constexpr (__have_avx) 3261 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3262 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3263 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3264 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3265 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3266 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3267 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3268 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3270 __assert_unreachable<_Tp>(); 3272 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3273 && sizeof(_Tp) == 4) 3275 const auto __xn = __vector_bitcast<int>(__xi); 3276 const auto __yn = __vector_bitcast<int>(__yi); 3277 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3278 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3279 return __auto_bitcast( 3280 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3282 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3283 && sizeof(_Tp) == 8) 3284 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3285 -_mm_ucomigt_sd(__xi, __yi), 3286 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3287 _mm_unpackhi_pd(__yi, __yi))}; 3289 return _Base::_S_isgreater(__x, __y); 3293 // _S_isgreaterequal {{{ 3294 template <typename _Tp, size_t _Np> 3295 static constexpr _MaskMember<_Tp> 3296 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3298 const auto __xi = __to_intrin(__x); 3299 const auto __yi = __to_intrin(__y); 3300 if constexpr (__is_avx512_abi<_Abi>()) 3302 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3303 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3304 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3305 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3306 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3308 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3310 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3312 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3314 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3316 __assert_unreachable<_Tp>(); 3318 else if constexpr (__have_avx) 3320 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3321 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3322 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3323 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3324 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3325 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3326 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3327 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3329 __assert_unreachable<_Tp>(); 3331 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3332 && sizeof(_Tp) == 4) 3334 const auto __xn = __vector_bitcast<int>(__xi); 3335 const auto __yn = __vector_bitcast<int>(__yi); 3336 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3337 const auto __yp = __yn < 0 ? 
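    // The SSE2 fallbacks for isgreater/isless/etc. compare the float bit
    // patterns as integers after folding the sign-magnitude encoding into a
    // two's-complement order: negative inputs map to -(bits & 0x7fff'ffff),
    // so more-negative values get smaller keys and +-0 both map to 0.  The
    // result is ANDed with cmpord so NaN operands compare false, and no
    // FE_INVALID is raised on quiet NaNs, as the ISO comparison macros
    // require.  Scalar sketch (illustration only):
    //   int __key(float __f)
    //   { int __i; __builtin_memcpy(&__i, &__f, 4);
    //     return __i < 0 ? -(__i & 0x7fffffff) : __i; }
    //   // for non-NaN __a, __b:  (__a < __b) == (__key(__a) < __key(__b))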
-(__yn & 0x7fff'ffff) : __yn; 3338 return __auto_bitcast( 3339 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3341 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3342 && sizeof(_Tp) == 8) 3343 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3344 -_mm_ucomige_sd(__xi, __yi), 3345 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3346 _mm_unpackhi_pd(__yi, __yi))}; 3348 return _Base::_S_isgreaterequal(__x, __y); 3353 template <typename _Tp, size_t _Np> 3354 static constexpr _MaskMember<_Tp> 3355 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3357 const auto __xi = __to_intrin(__x); 3358 const auto __yi = __to_intrin(__y); 3359 if constexpr (__is_avx512_abi<_Abi>()) 3361 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3362 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3363 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3364 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3365 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3366 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3367 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3368 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3369 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3370 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3371 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3372 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3373 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3375 __assert_unreachable<_Tp>(); 3377 else if constexpr (__have_avx) 3379 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3380 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3381 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3382 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3383 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3384 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3385 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3386 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3388 __assert_unreachable<_Tp>(); 3390 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3391 && sizeof(_Tp) == 4) 3393 const auto __xn = __vector_bitcast<int>(__xi); 3394 const auto __yn = __vector_bitcast<int>(__yi); 3395 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3396 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3397 return __auto_bitcast( 3398 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3400 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3401 && sizeof(_Tp) == 8) 3402 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3403 -_mm_ucomigt_sd(__yi, __xi), 3404 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3405 _mm_unpackhi_pd(__xi, __xi))}; 3407 return _Base::_S_isless(__x, __y); 3411 // _S_islessequal {{{ 3412 template <typename _Tp, size_t _Np> 3413 static constexpr _MaskMember<_Tp> 3414 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3416 const auto __xi = __to_intrin(__x); 3417 const auto __yi = __to_intrin(__y); 3418 if constexpr (__is_avx512_abi<_Abi>()) 3420 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3421 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3422 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3423 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3424 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3425 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3426 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3427 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3428 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3429 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3430 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3431 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3432 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3434 __assert_unreachable<_Tp>(); 3436 else if constexpr (__have_avx) 3438 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3439 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3440 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3441 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3442 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3443 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3444 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3445 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3447 __assert_unreachable<_Tp>(); 3449 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3450 && sizeof(_Tp) == 4) 3452 const auto __xn = __vector_bitcast<int>(__xi); 3453 const auto __yn = __vector_bitcast<int>(__yi); 3454 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3455 const auto __yp = __yn < 0 ? 
-(__yn & 0x7fff'ffff) : __yn; 3456 return __auto_bitcast( 3457 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3459 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3460 && sizeof(_Tp) == 8) 3461 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3462 -_mm_ucomige_sd(__yi, __xi), 3463 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3464 _mm_unpackhi_pd(__xi, __xi))}; 3466 return _Base::_S_islessequal(__x, __y); 3470 // _S_islessgreater {{{ 3471 template <typename _Tp, size_t _Np> 3472 static constexpr _MaskMember<_Tp> 3473 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3475 const auto __xi = __to_intrin(__x); 3476 const auto __yi = __to_intrin(__y); 3477 if constexpr (__is_avx512_abi<_Abi>()) 3479 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3480 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3481 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3482 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3483 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3484 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3485 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3486 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3487 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3488 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3489 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3490 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3491 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3493 __assert_unreachable<_Tp>(); 3495 else if constexpr (__have_avx) 3497 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3498 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3499 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3500 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3501 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3502 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3503 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3504 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3506 __assert_unreachable<_Tp>(); 3508 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3509 return __auto_bitcast( 3510 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3511 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3512 return __to_masktype( 3513 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3515 __assert_unreachable<_Tp>(); 3519 template <template <typename> class _Op, typename _Tp, typename _K, size_t _Np> 3520 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3521 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3523 if (__k._M_is_constprop_none_of()) 3525 else if (__k._M_is_constprop_all_of()) 3527 auto __vv = _Base::_M_make_simd(__v); 3528 _Op<decltype(__vv)> __op; 3529 return __data(__op(__vv)); 3531 else if constexpr (__is_bitmask_v<decltype(__k)> 3532 && (is_same_v<_Op<void>, __increment<void>> 3533 || is_same_v<_Op<void>, __decrement<void>>)) 3535 // optimize masked unary increment and decrement as masked sub +/-1 3536 constexpr int __pm_one 3537 = is_same_v<_Op<void>, __increment<void>> ? -1 : 1; 3539 return __movm<_Np, _Tp>(__k._M_data) ? 
__v._M_data - __pm_one : __v._M_data; 3541 if constexpr (is_integral_v<_Tp>) 3543 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3544 using _Ip = std::make_signed_t<_Tp>; 3545 using _Up = std::conditional_t< 3546 std::is_same_v<_Ip, long>, 3547 std::conditional_t<__lp64, long long, int>, 3549 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3550 const auto __value = __vector_bitcast<_Up>(__v._M_data); 3551 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3552 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3553 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \ 3554 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data)) 3555 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3556 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3557 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3558 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3559 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3560 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3561 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3562 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3563 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3564 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3565 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3566 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3567 #undef _GLIBCXX_SIMD_MASK_SUB 3571 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3572 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3573 return __builtin_ia32_##_Instr##_mask( \ 3574 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \ 3575 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3576 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512); 3577 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3578 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3579 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512); 3580 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3581 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3582 #undef _GLIBCXX_SIMD_MASK_SUB 3587 return _Base::template _S_masked_unary<_Op>(__k, __v); 3592 // _MaskImplX86Mixin {{{ 3593 struct _MaskImplX86Mixin 3595 template <typename _Tp> 3596 using _TypeTag = _Tp*; 3598 using _Base = _MaskImplBuiltinMixin; 3600 // _S_to_maskvector(bool) {{{ 3601 template <typename _Up, size_t _ToN = 1, typename _Tp> 3602 _GLIBCXX_SIMD_INTRINSIC static constexpr 3603 enable_if_t<is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>> 3604 _S_to_maskvector(_Tp __x) 3606 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3607 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3608 : __vector_type_t<_Up, _ToN>(); 3612 // _S_to_maskvector(_SanitizedBitMask) {{{ 3613 template <typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? 
_Np : _UpN> 3614 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3615 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3617 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3618 using _UV = __vector_type_t<_Up, _ToN>; 3619 using _UI = __intrinsic_type_t<_Up, _ToN>; 3620 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3621 if constexpr (_Np == 1) 3622 return _S_to_maskvector<_Up, _ToN>(__k); 3623 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3624 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>( 3625 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3626 else if constexpr (sizeof(_Up) == 1) 3628 if constexpr (sizeof(_UI) == 16) 3630 if constexpr (__have_avx512bw_vl) 3631 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3632 else if constexpr (__have_avx512bw) 3633 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3634 else if constexpr (__have_avx512f) 3636 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3638 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3639 __hi256(__as32bits))); 3640 return __intrin_bitcast<_UV>( 3641 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3643 else if constexpr (__have_ssse3) 3645 const auto __bitmask = __to_intrin( 3646 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3647 8, 16, 32, 64, 128)); 3648 return __intrin_bitcast<_UV>( 3649 __vector_bitcast<_Up>( 3650 _mm_shuffle_epi8(__to_intrin( 3651 __vector_type_t<_ULLong, 2>{__k}), 3652 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3653 1, 1, 1, 1, 1, 1, 1)) 3657 // else fall through 3659 else if constexpr (sizeof(_UI) == 32) 3661 if constexpr (__have_avx512bw_vl) 3662 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3663 else if constexpr (__have_avx512bw) 3664 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3665 else if constexpr (__have_avx512f) 3667 auto __as16bits = // 0 16 1 17 ... 15 31 3668 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3670 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3673 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3674 __lo256(__as16bits), 3675 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3678 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3679 __0_16_1_17, // 0 16 1 17 2 ... 
3680 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3681 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3683 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3684 // 0-3 8-11 16-19 24-27 3685 // 4-7 12-15 20-23 28-31 3687 else if constexpr (__have_avx2) 3689 const auto __bitmask 3690 = _mm256_broadcastsi128_si256(__to_intrin( 3691 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3692 4, 8, 16, 32, 64, 128))); 3693 return __vector_bitcast<_Up>( 3694 __vector_bitcast<_Up>( 3695 _mm256_shuffle_epi8( 3696 _mm256_broadcastsi128_si256( 3697 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3698 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3699 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3704 // else fall through 3706 else if constexpr (sizeof(_UI) == 64) 3707 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3708 if constexpr (std::min(_ToN, _Np) <= 4) 3710 if constexpr (_Np > 7) // avoid overflow 3711 __x &= _SanitizedBitMask<_Np>(0x0f); 3712 const _UInt __char_mask 3713 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3716 __builtin_memcpy(&__r, &__char_mask, 3717 std::min(sizeof(__r), sizeof(__char_mask))); 3720 else if constexpr (std::min(_ToN, _Np) <= 7) 3722 if constexpr (_Np > 7) // avoid overflow 3723 __x &= _SanitizedBitMask<_Np>(0x7f); 3724 const _ULLong __char_mask 3725 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3728 __builtin_memcpy(&__r, &__char_mask, 3729 std::min(sizeof(__r), sizeof(__char_mask))); 3733 else if constexpr (sizeof(_Up) == 2) 3735 if constexpr (sizeof(_UI) == 16) 3737 if constexpr (__have_avx512bw_vl) 3738 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3739 else if constexpr (__have_avx512bw) 3740 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3741 else if constexpr (__have_avx512f) 3743 __m256i __as32bits = {}; 3744 if constexpr (__have_avx512vl) 3745 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3748 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3749 return __intrin_bitcast<_UV>( 3750 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3752 // else fall through 3754 else if constexpr (sizeof(_UI) == 32) 3756 if constexpr (__have_avx512bw_vl) 3757 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3758 else if constexpr (__have_avx512bw) 3759 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3760 else if constexpr (__have_avx512f) 3762 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3763 return __vector_bitcast<_Up>( 3764 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3765 __hi256(__as32bits)))); 3767 // else fall through 3769 else if constexpr (sizeof(_UI) == 64) 3770 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3772 else if constexpr (sizeof(_Up) == 4) 3774 if constexpr (sizeof(_UI) == 16) 3776 if constexpr (__have_avx512dq_vl) 3777 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3778 else if constexpr (__have_avx512dq) 3779 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3780 else if constexpr (__have_avx512vl) 3781 return __intrin_bitcast<_UV>( 3782 _mm_maskz_mov_epi32(__k, ~__m128i())); 3783 else if constexpr (__have_avx512f) 3784 return __intrin_bitcast<_UV>( 3785 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3786 // else fall through 3788 else if constexpr (sizeof(_UI) == 32) 3790 if constexpr (__have_avx512dq_vl) 3791 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3792 else if constexpr (__have_avx512dq) 3793 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3794 else if constexpr 
(__have_avx512vl) 3795 return __vector_bitcast<_Up>( 3796 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3797 else if constexpr (__have_avx512f) 3798 return __vector_bitcast<_Up>( 3799 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3800 // else fall through 3802 else if constexpr (sizeof(_UI) == 64) 3803 return __vector_bitcast<_Up>( 3804 __have_avx512dq ? _mm512_movm_epi32(__k) 3805 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3807 else if constexpr (sizeof(_Up) == 8) 3809 if constexpr (sizeof(_UI) == 16) 3811 if constexpr (__have_avx512dq_vl) 3812 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3813 else if constexpr (__have_avx512dq) 3814 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3815 else if constexpr (__have_avx512vl) 3816 return __vector_bitcast<_Up>( 3817 _mm_maskz_mov_epi64(__k, ~__m128i())); 3818 else if constexpr (__have_avx512f) 3819 return __vector_bitcast<_Up>( 3820 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3821 // else fall through 3823 else if constexpr (sizeof(_UI) == 32) 3825 if constexpr (__have_avx512dq_vl) 3826 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3827 else if constexpr (__have_avx512dq) 3828 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3829 else if constexpr (__have_avx512vl) 3830 return __vector_bitcast<_Up>( 3831 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3832 else if constexpr (__have_avx512f) 3833 return __vector_bitcast<_Up>( 3834 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3835 // else fall through 3837 else if constexpr (sizeof(_UI) == 64) 3838 return __vector_bitcast<_Up>( 3839 __have_avx512dq ? _mm512_movm_epi64(__k) 3840 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3843 using _UpUInt = make_unsigned_t<_Up>; 3844 using _V = __vector_type_t<_UpUInt, _ToN>; 3845 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3846 if constexpr (_ToN == 2) 3848 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3850 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3852 if constexpr (sizeof(_Up) == 4) 3853 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3854 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3855 _mm256_castsi256_ps(_mm256_setr_epi32( 3856 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3857 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3858 else if constexpr (sizeof(_Up) == 8) 3859 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3860 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3861 _mm256_castsi256_pd( 3862 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3863 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3865 __assert_unreachable<_Up>(); 3867 else if constexpr (__bits_per_element >= _ToN) 3869 constexpr auto __bitmask 3870 = __generate_vector<_V>([](auto __i) 3871 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3872 { return __i < _ToN ? 
1ull << __i : 0; }); 3874 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3875 if constexpr (__bits_per_element > _ToN) 3876 return __vector_bitcast<_Up>(__bits) > 0; 3878 return __vector_bitcast<_Up>(__bits != 0); 3883 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3884 return static_cast<_UpUInt>( 3885 __k >> (__bits_per_element * (__i / __bits_per_element))); 3887 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3888 return static_cast<_UpUInt>(1ull 3889 << (__i % __bits_per_element)); 3890 }); // mask bit index 3891 return __intrin_bitcast<_UV>(__tmp != _V()); 3896 // _S_to_maskvector(_SimdWrapper) {{{ 3897 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3898 size_t _ToN = _UpN == 0 ? _Np : _UpN> 3899 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3900 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3902 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3903 using _TW = _SimdWrapper<_Tp, _Np>; 3904 using _UW = _SimdWrapper<_Up, _ToN>; 3905 using _UI = __intrinsic_type_t<_Up, _ToN>; 3906 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3907 return _S_to_maskvector<_Up, _ToN>( 3908 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3909 // vector -> vector bitcast 3910 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3911 && sizeof(_TW) == sizeof(_UW)) 3912 return __wrapper_bitcast<_Up, _ToN>( 3915 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x)); 3916 else // vector -> vector {{{ 3918 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3920 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3921 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3922 __vector_type_t<_Up, _ToN>>( 3923 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3925 using _To = __vector_type_t<_Up, _ToN>; 3926 [[maybe_unused]] constexpr size_t _FromN = _Np; 3927 constexpr int _FromBytes = sizeof(_Tp); 3928 constexpr int _ToBytes = sizeof(_Up); 3929 const auto __k = __x._M_data; 3931 if constexpr (_FromBytes == _ToBytes) 3932 return __intrin_bitcast<_To>(__k); 3933 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3935 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3936 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3937 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3940 = __vector_bitcast<int>(__interleave128_lo(__k, __k)); 3941 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3943 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3946 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3948 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3949 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3951 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3953 return __intrin_bitcast<_To>( 3954 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3955 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3956 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3958 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3959 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3960 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3963 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3964 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3966 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3968 if constexpr (__have_sse2 && !__have_ssse3) 3969 return __intrin_bitcast<_To>(_mm_packs_epi32( 3970 
_mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3973 return __intrin_bitcast<_To>( 3974 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3975 __vector_bitcast<_Up>(__k))); 3977 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3978 return __intrin_bitcast<_To>( 3979 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3980 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3981 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3982 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3984 return __intrin_bitcast<_To>( 3985 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3986 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3987 -1, -1, -1, -1, -1, -1, -1, 3989 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3992 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3993 __y = _mm_packs_epi32(__y, __m128i()); 3994 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3996 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3998 return __intrin_bitcast<_To>( 3999 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4000 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4001 -1, -1, -1, -1, -1, -1, -1, 4003 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4006 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 4007 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 4009 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 4010 return __intrin_bitcast<_To>( 4011 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 4013 __assert_unreachable<_Tp>(); 4015 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 4017 if constexpr (_FromBytes == _ToBytes) 4018 __assert_unreachable<_Tp>(); 4019 else if constexpr (_FromBytes == _ToBytes * 2) 4021 const auto __y = __vector_bitcast<_LLong>(__k); 4022 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4023 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4025 else if constexpr (_FromBytes == _ToBytes * 4) 4027 const auto __y = __vector_bitcast<_LLong>(__k); 4028 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4029 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4032 else if constexpr (_FromBytes == _ToBytes * 8) 4034 const auto __y = __vector_bitcast<_LLong>(__k); 4035 return __intrin_bitcast<_To>( 4036 _mm256_castsi128_si256(_mm_shuffle_epi8( 4037 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4038 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4039 -1, -1, -1, -1, -1)))); 4041 else if constexpr (_FromBytes * 2 == _ToBytes) 4043 auto __y = __xzyw(__to_intrin(__k)); 4044 if constexpr (is_floating_point_v< 4045 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4047 const auto __yy = __vector_bitcast<float>(__y); 4048 return __intrin_bitcast<_To>( 4049 _mm256_unpacklo_ps(__yy, __yy)); 4052 return __intrin_bitcast<_To>( 4053 _mm256_unpacklo_epi8(__y, __y)); 4055 else if constexpr (_FromBytes * 4 == _ToBytes) 4058 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4059 __lo128(__vector_bitcast<_LLong>( 4060 __k))); // drops 3/4 of input 4061 return __intrin_bitcast<_To>( 4062 __concat(_mm_unpacklo_epi16(__y, __y), 4063 _mm_unpackhi_epi16(__y, __y))); 4065 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4068 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4069 __lo128(__vector_bitcast<_LLong>( 4070 __k))); // drops 3/4 of input 4072 = _mm_unpacklo_epi16(__y, 4073 __y); // drops another 1/2 => 7/8 total 4074 return __intrin_bitcast<_To>( 4075 __concat(_mm_unpacklo_epi32(__y, __y), 4076 _mm_unpackhi_epi32(__y, __y))); 4079 
__assert_unreachable<_Tp>(); 4081 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4083 if constexpr (_FromBytes == _ToBytes) 4084 return __intrin_bitcast<_To>( 4085 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4086 __zero_extend(__to_intrin(__k)))); 4087 else if constexpr (_FromBytes * 2 == _ToBytes) 4089 return __intrin_bitcast<_To>( 4090 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4091 __vector_bitcast<_LLong>(__k)), 4092 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4093 __vector_bitcast<_LLong>(__k)))); 4095 else if constexpr (_FromBytes * 4 == _ToBytes) 4097 if constexpr (__have_avx2) 4099 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4100 __concat(__vector_bitcast<_LLong>(__k), 4101 __vector_bitcast<_LLong>(__k)), 4102 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4103 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4104 6, 6, 7, 7, 7, 7))); 4108 return __intrin_bitcast<_To>(__concat( 4109 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4110 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4111 2, 2, 2, 2, 3, 3, 3, 3)), 4112 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4113 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4114 6, 6, 6, 6, 7, 7, 7, 4118 else if constexpr (_FromBytes * 8 == _ToBytes) 4120 if constexpr (__have_avx2) 4122 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4123 __concat(__vector_bitcast<_LLong>(__k), 4124 __vector_bitcast<_LLong>(__k)), 4125 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4126 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4127 3, 3, 3, 3, 3, 3))); 4131 return __intrin_bitcast<_To>(__concat( 4132 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4133 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4134 1, 1, 1, 1, 1, 1, 1, 1)), 4135 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4136 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4137 3, 3, 3, 3, 3, 3, 3, 4141 else if constexpr (_FromBytes == _ToBytes * 2) 4142 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4143 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4144 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4146 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4147 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4148 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4149 -1, -1, -1, -1, -1, -1, -1, 4152 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4154 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4155 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4156 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4157 -1, -1, -1, -1, -1, -1, -1, 4160 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4162 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4163 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4164 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4165 -1, -1, -1, -1, -1, -1, -1, 4169 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4171 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4173 if constexpr (_FromBytes == _ToBytes) 4175 return __intrin_bitcast<_To>(__lo128(__k)); 4177 else if constexpr (_FromBytes == _ToBytes * 2) 4179 auto __y = __vector_bitcast<_LLong>(__k); 4180 return __intrin_bitcast<_To>( 4181 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4183 else if constexpr (_FromBytes == _ToBytes * 4) 4185 auto __y = __vector_bitcast<_LLong>(__k); 4186 return __intrin_bitcast<_To>( 4187 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4190 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4192 auto __y = __vector_bitcast<_LLong>(__k); 4193 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4194 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4195 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4198 else if constexpr (_FromBytes * 2 == _ToBytes) 4200 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4201 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4203 else if constexpr (_FromBytes * 4 == _ToBytes) 4205 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4206 __y = _mm_unpacklo_epi8(__y, __y); 4207 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4209 else if constexpr (_FromBytes * 8 == _ToBytes) 4211 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4212 __y = _mm_unpacklo_epi8(__y, __y); 4213 __y = _mm_unpacklo_epi8(__y, __y); 4214 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4217 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable
"); 4220 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4222 if constexpr (_FromBytes > _ToBytes) { 4223 const _To __y = __vector_bitcast<_Up>(__k); 4224 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4225 constexpr int _Stride = _FromBytes / _ToBytes; 4226 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4227 }(make_index_sequence<std::min(_ToN, _FromN)>()); 4229 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4230 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4231 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4233 return [&] <size_t... _Is> (index_sequence<_Is...>) { 4234 constexpr int __dup = _ToBytes / _FromBytes; 4235 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4236 }(make_index_sequence<_FromN>()); 4244 template <typename _Tp, size_t _Np> 4245 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4246 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4248 if constexpr (is_same_v<_Tp, bool>) 4249 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4252 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4253 if (__builtin_is_constant_evaluated() 4254 || __builtin_constant_p(__x._M_data)) 4256 const auto __bools = -__x._M_data; 4257 const _ULLong __k = __call_with_n_evaluations<_Np>( 4258 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4259 return (__bits | ...); 4260 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4261 return _ULLong(__bools[+__i]) << __i; 4263 if (__builtin_is_constant_evaluated() 4264 || __builtin_constant_p(__k)) 4267 const auto __xi = __to_intrin(__x); 4268 if constexpr (sizeof(_Tp) == 1) 4269 if constexpr (sizeof(__xi) == 16) 4270 if constexpr (__have_avx512bw_vl) 4271 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4272 else // implies SSE2 4273 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4274 else if constexpr (sizeof(__xi) == 32) 4275 if constexpr (__have_avx512bw_vl) 4276 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4277 else // implies AVX2 4278 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4279 else // implies AVX512BW 4280 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4282 else if constexpr (sizeof(_Tp) == 2) 4283 if constexpr (sizeof(__xi) == 16) 4284 if constexpr (__have_avx512bw_vl) 4285 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4286 else if constexpr (__have_avx512bw) 4287 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4288 else // implies SSE2 4289 return _BitMask<_Np>( 4290 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4291 else if constexpr (sizeof(__xi) == 32) 4292 if constexpr (__have_avx512bw_vl) 4293 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4294 else if constexpr (__have_avx512bw) 4295 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4296 else // implies SSE2 4297 return _BitMask<_Np>(_mm_movemask_epi8( 4298 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4299 else // implies AVX512BW 4300 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4302 else if constexpr (sizeof(_Tp) == 4) 4303 if constexpr (sizeof(__xi) == 16) 4304 if constexpr (__have_avx512dq_vl) 4305 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4306 else if constexpr (__have_avx512vl) 4307 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4308 else if constexpr (__have_avx512dq) 4309 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4310 else if constexpr (__have_avx512f) 4311 return _BitMask<_Np>( 4312 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4314 return _BitMask<_Np>( 4315 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4316 else if 
constexpr (sizeof(__xi) == 32) 4317 if constexpr (__have_avx512dq_vl) 4318 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4319 else if constexpr (__have_avx512dq) 4320 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4321 else if constexpr (__have_avx512vl) 4322 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4323 else if constexpr (__have_avx512f) 4324 return _BitMask<_Np>( 4325 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4327 return _BitMask<_Np>( 4328 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4329 else // implies AVX512?? 4330 if constexpr (__have_avx512dq) 4331 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4332 else // implies AVX512F 4333 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4335 else if constexpr (sizeof(_Tp) == 8) 4336 if constexpr (sizeof(__xi) == 16) 4337 if constexpr (__have_avx512dq_vl) 4338 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4339 else if constexpr (__have_avx512dq) 4340 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4341 else if constexpr (__have_avx512vl) 4342 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4343 else if constexpr (__have_avx512f) 4344 return _BitMask<_Np>( 4345 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4346 else // implies SSE2 4347 return _BitMask<_Np>( 4348 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4349 else if constexpr (sizeof(__xi) == 32) 4350 if constexpr (__have_avx512dq_vl) 4351 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4352 else if constexpr (__have_avx512dq) 4353 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4354 else if constexpr (__have_avx512vl) 4355 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4356 else if constexpr (__have_avx512f) 4357 return _BitMask<_Np>( 4358 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4360 return _BitMask<_Np>( 4361 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4362 else // implies AVX512?? 4363 if constexpr (__have_avx512dq) 4364 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4365 else // implies AVX512F 4366 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4369 __assert_unreachable<_Tp>(); 4377 template <typename _Abi, typename> 4378 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4380 using _MaskImplX86Mixin::_S_to_bits; 4381 using _MaskImplX86Mixin::_S_to_maskvector; 4382 using _MaskImplBuiltin<_Abi>::_S_convert; 4385 template <typename _Tp> 4386 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4388 template <typename _Tp> 4389 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4391 template <typename _Tp> 4392 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4394 using _Base = _MaskImplBuiltin<_Abi>; 4398 template <typename _Tp> 4399 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4400 _S_broadcast(bool __x) 4402 if constexpr (__is_avx512_abi<_Abi>()) 4403 return __x ? 
_Abi::_S_masked(_MaskMember<_Tp>(-1)) 4404 : _MaskMember<_Tp>(); 4406 return _Base::template _S_broadcast<_Tp>(__x); 4411 template <typename _Tp> 4412 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4413 _S_load(const bool* __mem) 4415 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4416 if (__builtin_is_constant_evaluated()) 4418 if constexpr (__is_avx512_abi<_Abi>()) 4420 _MaskMember<_Tp> __r{}; 4421 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) 4422 __r._M_data |= _ULLong(__mem[__i]) << __i; 4426 return _Base::template _S_load<_Tp>(__mem); 4428 else if constexpr (__have_avx512bw) 4430 const auto __to_vec_or_bits 4431 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { 4432 if constexpr (__is_avx512_abi<_Abi>()) 4435 return _S_to_maskvector<_Tp>( 4436 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4439 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4442 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4443 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4445 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4448 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4449 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4451 else if constexpr (_S_size<_Tp> <= 64) 4454 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4455 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4458 else if constexpr (__is_avx512_abi<_Abi>()) 4460 if constexpr (_S_size<_Tp> <= 8) 4463 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4464 const auto __b = _mm512_cvtepi8_epi64(__a); 4465 return _mm512_test_epi64_mask(__b, __b); 4467 else if constexpr (_S_size<_Tp> <= 16) 4470 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4471 const auto __b = _mm512_cvtepi8_epi32(__a); 4472 return _mm512_test_epi32_mask(__b, __b); 4474 else if constexpr (_S_size<_Tp> <= 32) 4477 __builtin_memcpy(&__a, __mem, 16); 4478 const auto __b = _mm512_cvtepi8_epi32(__a); 4479 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4480 const auto __c = _mm512_cvtepi8_epi32(__a); 4481 return _mm512_test_epi32_mask(__b, __b) 4482 | (_mm512_test_epi32_mask(__c, __c) << 16); 4484 else if constexpr (_S_size<_Tp> <= 64) 4487 __builtin_memcpy(&__a, __mem, 16); 4488 const auto __b = _mm512_cvtepi8_epi32(__a); 4489 __builtin_memcpy(&__a, __mem + 16, 16); 4490 const auto __c = _mm512_cvtepi8_epi32(__a); 4491 if constexpr (_S_size<_Tp> <= 48) 4493 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4494 const auto __d = _mm512_cvtepi8_epi32(__a); 4495 return _mm512_test_epi32_mask(__b, __b) 4496 | (_mm512_test_epi32_mask(__c, __c) << 16) 4497 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4501 __builtin_memcpy(&__a, __mem + 16, 16); 4502 const auto __d = _mm512_cvtepi8_epi32(__a); 4503 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4504 const auto __e = _mm512_cvtepi8_epi32(__a); 4505 return _mm512_test_epi32_mask(__b, __b) 4506 | (_mm512_test_epi32_mask(__c, __c) << 16) 4507 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4508 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4512 __assert_unreachable<_Tp>(); 4514 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4515 return __vector_bitcast<_Tp>( 4516 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]), 4517 -int(__mem[1]), -int(__mem[1])}); 4518 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4521 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4522 const auto __k = __to_intrin( 4523 (__vector_broadcast<4>(__bool4) 4524 & __make_vector<int>(0x1, 0x100, 0x10000, 4525 _S_size<_Tp> == 4 ? 
0x1000000 : 0)) 4527 return __vector_bitcast<_Tp>( 4528 __concat(_mm_unpacklo_epi32(__k, __k), 4529 _mm_unpackhi_epi32(__k, __k))); 4531 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4) 4534 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>); 4535 if constexpr (__have_sse2) 4537 __m128i __k = _mm_cvtsi32_si128(__bools); 4538 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4539 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4540 _mm_unpacklo_epi16(__k, __k)); 4544 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools)); 4546 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4547 _mm_cmpgt_ps(__k, __m128())); 4550 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8) 4553 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4554 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i()); 4555 return __vector_bitcast<_Tp>( 4556 __concat(_mm_unpacklo_epi16(__k, __k), 4557 _mm_unpackhi_epi16(__k, __k))); 4559 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16) 4562 __builtin_memcpy(&__k, __mem, _S_size<_Tp>); 4563 __k = _mm_cmpgt_epi8(__k, __m128i()); 4564 if constexpr (_S_size<_Tp> <= 8) 4565 return __vector_bitcast<_Tp, _S_size<_Tp>>( 4566 _mm_unpacklo_epi8(__k, __k)); 4568 return __concat(_mm_unpacklo_epi8(__k, __k), 4569 _mm_unpackhi_epi8(__k, __k)); 4572 return _Base::template _S_load<_Tp>(__mem); 4576 // _S_from_bitmask{{{ 4577 template <size_t _Np, typename _Tp> 4578 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 4579 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 4581 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4582 if constexpr (__is_avx512_abi<_Abi>()) 4583 return __bits._M_to_bits(); 4585 return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); 4589 // _S_masked_load {{{2 4590 template <typename _Tp, size_t _Np> 4591 static inline _SimdWrapper<_Tp, _Np> 4592 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 4593 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 4595 if constexpr (__is_avx512_abi<_Abi>()) 4597 if constexpr (__have_avx512bw_vl) 4599 if constexpr (_Np <= 16) 4602 = _mm_mask_loadu_epi8(__m128i(), __mask, __mem); 4603 return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a); 4605 else if constexpr (_Np <= 32) 4608 = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem); 4609 return (__merge & ~__mask) 4610 | _mm256_test_epi8_mask(__a, __a); 4612 else if constexpr (_Np <= 64) 4615 = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem); 4616 return (__merge & ~__mask) 4617 | _mm512_test_epi8_mask(__a, __a); 4620 __assert_unreachable<_Tp>(); 4624 _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4625 __merge._M_set(__i, __mem[__i]); 4630 else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1) 4632 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4633 __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(), 4634 _mm256_mask_loadu_epi8(__m256i(), 4637 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1) 4639 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4641 = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k, 4643 _mm_mask_loadu_epi8(__m128i(), __k, __mem)); 4645 else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2) 4647 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4648 __merge = _mm256_mask_sub_epi16( 4649 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4650 _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4652 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2) 4654 const auto __k = 
_S_to_bits(__mask)._M_to_bits(); 4655 __merge = _mm_mask_sub_epi16( 4656 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4657 _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem))); 4659 else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4) 4661 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4662 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32( 4663 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4664 _mm256_cvtepi8_epi32( 4665 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4667 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4) 4669 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4670 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32( 4671 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4672 _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4674 else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8) 4676 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4677 __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64( 4678 __vector_bitcast<_LLong>(__merge), __k, __m256i(), 4679 _mm256_cvtepi8_epi64( 4680 _mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4682 else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8) 4684 const auto __k = _S_to_bits(__mask)._M_to_bits(); 4685 __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64( 4686 __vector_bitcast<_LLong>(__merge), __k, __m128i(), 4687 _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem)))); 4690 return _Base::_S_masked_load(__merge, __mask, __mem); 4695 template <typename _Tp, size_t _Np> 4696 _GLIBCXX_SIMD_INTRINSIC static constexpr void 4697 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept 4699 if (__builtin_is_constant_evaluated()) 4700 _Base::_S_store(__v, __mem); 4701 else if constexpr (__is_avx512_abi<_Abi>()) 4703 if constexpr (__have_avx512bw_vl) 4704 _CommonImplX86::_S_store<_Np>( 4705 __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4706 if constexpr (_Np <= 16) 4707 return _mm_maskz_set1_epi8(__data, 1); 4708 else if constexpr (_Np <= 32) 4709 return _mm256_maskz_set1_epi8(__data, 1); 4711 return _mm512_maskz_set1_epi8(__data, 1); 4714 else if constexpr (_Np <= 8) 4715 _CommonImplX86::_S_store<_Np>( 4716 __vector_bitcast<char>( 4717 #if defined __x86_64__ 4718 __make_wrapper<_ULLong>( 4719 _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull) 4721 __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U), 4722 _pdep_u32(__v._M_data >> 4, 4727 else if constexpr (_Np <= 16) 4728 _mm512_mask_cvtepi32_storeu_epi8( 4729 __mem, 0xffffu >> (16 - _Np), 4730 _mm512_maskz_set1_epi32(__v._M_data, 1)); 4732 __assert_unreachable<_Tp>(); 4734 else if constexpr (__is_sse_abi<_Abi>()) //{{{ 4736 if constexpr (_Np == 2 && sizeof(_Tp) == 8) 4738 const auto __k = __vector_bitcast<int>(__v); 4742 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 4744 if constexpr (__have_sse2) 4746 const unsigned __bool4 4747 = __vector_bitcast<_UInt>(_mm_packs_epi16( 4748 _mm_packs_epi32(__intrin_bitcast<__m128i>( 4753 __builtin_memcpy(__mem, &__bool4, _Np); 4755 else if constexpr (__have_mmx) 4757 const __m64 __k = _mm_cvtps_pi8( 4758 __and(__to_intrin(__v), _mm_set1_ps(1.f))); 4759 __builtin_memcpy(__mem, &__k, _Np); 4763 return _Base::_S_store(__v, __mem); 4765 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 4767 _CommonImplX86::_S_store<_Np>( 4768 __vector_bitcast<char>(_mm_packs_epi16( 4769 __to_intrin(__vector_bitcast<_UShort>(__v) >> 15), 4773 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 4774 
_CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem); 4776 __assert_unreachable<_Tp>(); 4778 else if constexpr (__is_avx_abi<_Abi>()) // {{{ 4780 if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 4782 auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4784 if constexpr (__have_avx2) 4785 __bool4 = _mm256_movemask_epi8(__k); 4787 __bool4 = (_mm_movemask_epi8(__lo128(__k)) 4788 | (_mm_movemask_epi8(__hi128(__k)) << 16)); 4789 __bool4 &= 0x01010101; 4790 __builtin_memcpy(__mem, &__bool4, _Np); 4792 else if constexpr (_Np <= 8 && sizeof(_Tp) == 4) 4794 const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v)); 4796 = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)), 4799 = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i())); 4800 _CommonImplX86::_S_store<_Np>(__k3, __mem); 4802 else if constexpr (_Np <= 16 && sizeof(_Tp) == 2) 4804 if constexpr (__have_avx2) 4806 const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15); 4807 const auto __bools = __vector_bitcast<char>( 4808 _mm_packs_epi16(__lo128(__x), __hi128(__x))); 4809 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4815 & __vector_bitcast<_UChar>( 4816 _mm_packs_epi16(__lo128(__to_intrin(__v)), 4817 __hi128(__to_intrin(__v)))); 4818 _CommonImplX86::_S_store<_Np>(__bools, __mem); 4821 else if constexpr (_Np <= 32 && sizeof(_Tp) == 1) 4822 _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem); 4824 __assert_unreachable<_Tp>(); 4827 __assert_unreachable<_Tp>(); 4830 // _S_masked_store {{{2 4831 template <typename _Tp, size_t _Np> 4833 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 4834 const _SimdWrapper<_Tp, _Np> __k) noexcept 4836 if constexpr (__is_avx512_abi<_Abi>()) 4838 static_assert(is_same_v<_Tp, bool>); 4839 if constexpr (_Np <= 16 && __have_avx512bw_vl) 4840 _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1)); 4841 else if constexpr (_Np <= 16) 4842 _mm512_mask_cvtepi32_storeu_epi8(__mem, __k, 4843 _mm512_maskz_set1_epi32(__v, 1)); 4844 else if constexpr (_Np <= 32 && __have_avx512bw_vl) 4845 _mm256_mask_storeu_epi8(__mem, __k, 4846 _mm256_maskz_set1_epi8(__v, 1)); 4847 else if constexpr (_Np <= 32 && __have_avx512bw) 4848 _mm256_mask_storeu_epi8(__mem, __k, 4849 __lo256(_mm512_maskz_set1_epi8(__v, 1))); 4850 else if constexpr (_Np <= 64 && __have_avx512bw) 4851 _mm512_mask_storeu_epi8(__mem, __k, 4852 _mm512_maskz_set1_epi8(__v, 1)); 4854 __assert_unreachable<_Tp>(); 4857 _Base::_S_masked_store(__v, __mem, __k); 4860 // logical and bitwise operators {{{2 4861 template <typename _Tp, size_t _Np> 4862 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4863 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4865 if constexpr (is_same_v<_Tp, bool>) 4867 if (__builtin_is_constant_evaluated()) 4868 return __x._M_data & __y._M_data; 4869 else if constexpr (__have_avx512dq && _Np <= 8) 4870 return _kand_mask8(__x._M_data, __y._M_data); 4871 else if constexpr (_Np <= 16) 4872 return _kand_mask16(__x._M_data, __y._M_data); 4873 else if constexpr (__have_avx512bw && _Np <= 32) 4874 return _kand_mask32(__x._M_data, __y._M_data); 4875 else if constexpr (__have_avx512bw && _Np <= 64) 4876 return _kand_mask64(__x._M_data, __y._M_data); 4878 __assert_unreachable<_Tp>(); 4881 return _Base::_S_logical_and(__x, __y); 4884 template <typename _Tp, size_t _Np> 4885 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4886 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4888 if constexpr (is_same_v<_Tp, bool>) 4890 if 
(__builtin_is_constant_evaluated()) 4891 return __x._M_data | __y._M_data; 4892 else if constexpr (__have_avx512dq && _Np <= 8) 4893 return _kor_mask8(__x._M_data, __y._M_data); 4894 else if constexpr (_Np <= 16) 4895 return _kor_mask16(__x._M_data, __y._M_data); 4896 else if constexpr (__have_avx512bw && _Np <= 32) 4897 return _kor_mask32(__x._M_data, __y._M_data); 4898 else if constexpr (__have_avx512bw && _Np <= 64) 4899 return _kor_mask64(__x._M_data, __y._M_data); 4901 __assert_unreachable<_Tp>(); 4904 return _Base::_S_logical_or(__x, __y); 4907 template <typename _Tp, size_t _Np> 4908 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4909 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) 4911 if constexpr (is_same_v<_Tp, bool>) 4913 if (__builtin_is_constant_evaluated()) 4914 return __x._M_data ^ _Abi::template __implicit_mask_n<_Np>(); 4915 else if constexpr (__have_avx512dq && _Np <= 8) 4916 return _kandn_mask8(__x._M_data, 4917 _Abi::template __implicit_mask_n<_Np>()); 4918 else if constexpr (_Np <= 16) 4919 return _kandn_mask16(__x._M_data, 4920 _Abi::template __implicit_mask_n<_Np>()); 4921 else if constexpr (__have_avx512bw && _Np <= 32) 4922 return _kandn_mask32(__x._M_data, 4923 _Abi::template __implicit_mask_n<_Np>()); 4924 else if constexpr (__have_avx512bw && _Np <= 64) 4925 return _kandn_mask64(__x._M_data, 4926 _Abi::template __implicit_mask_n<_Np>()); 4928 __assert_unreachable<_Tp>(); 4931 return _Base::_S_bit_not(__x); 4934 template <typename _Tp, size_t _Np> 4935 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4936 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4938 if constexpr (is_same_v<_Tp, bool>) 4940 if (__builtin_is_constant_evaluated()) 4941 return __x._M_data & __y._M_data; 4942 else if constexpr (__have_avx512dq && _Np <= 8) 4943 return _kand_mask8(__x._M_data, __y._M_data); 4944 else if constexpr (_Np <= 16) 4945 return _kand_mask16(__x._M_data, __y._M_data); 4946 else if constexpr (__have_avx512bw && _Np <= 32) 4947 return _kand_mask32(__x._M_data, __y._M_data); 4948 else if constexpr (__have_avx512bw && _Np <= 64) 4949 return _kand_mask64(__x._M_data, __y._M_data); 4951 __assert_unreachable<_Tp>(); 4954 return _Base::_S_bit_and(__x, __y); 4957 template <typename _Tp, size_t _Np> 4958 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4959 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4961 if constexpr (is_same_v<_Tp, bool>) 4963 if (__builtin_is_constant_evaluated()) 4964 return __x._M_data | __y._M_data; 4965 else if constexpr (__have_avx512dq && _Np <= 8) 4966 return _kor_mask8(__x._M_data, __y._M_data); 4967 else if constexpr (_Np <= 16) 4968 return _kor_mask16(__x._M_data, __y._M_data); 4969 else if constexpr (__have_avx512bw && _Np <= 32) 4970 return _kor_mask32(__x._M_data, __y._M_data); 4971 else if constexpr (__have_avx512bw && _Np <= 64) 4972 return _kor_mask64(__x._M_data, __y._M_data); 4974 __assert_unreachable<_Tp>(); 4977 return _Base::_S_bit_or(__x, __y); 4980 template <typename _Tp, size_t _Np> 4981 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 4982 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 4984 if constexpr (is_same_v<_Tp, bool>) 4986 if (__builtin_is_constant_evaluated()) 4987 return __x._M_data ^ __y._M_data; 4988 else if constexpr (__have_avx512dq && _Np <= 8) 4989 return _kxor_mask8(__x._M_data, __y._M_data); 4990 else if constexpr (_Np <= 16) 4991 return 
_kxor_mask16(__x._M_data, __y._M_data); 4992 else if constexpr (__have_avx512bw && _Np <= 32) 4993 return _kxor_mask32(__x._M_data, __y._M_data); 4994 else if constexpr (__have_avx512bw && _Np <= 64) 4995 return _kxor_mask64(__x._M_data, __y._M_data); 4997 __assert_unreachable<_Tp>(); 5000 return _Base::_S_bit_xor(__x, __y); 5004 // _S_masked_assign{{{ 5005 template <size_t _Np> 5006 _GLIBCXX_SIMD_INTRINSIC static void 5007 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 5008 _SimdWrapper<bool, _Np>& __lhs, _SimdWrapper<bool, _Np> __rhs) 5011 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data); 5014 template <size_t _Np> 5015 _GLIBCXX_SIMD_INTRINSIC static void 5016 _S_masked_assign(_SimdWrapper<bool, _Np> __k, 5017 _SimdWrapper<bool, _Np>& __lhs, bool __rhs) 5020 __lhs._M_data = __k._M_data | __lhs._M_data; 5022 __lhs._M_data = ~__k._M_data & __lhs._M_data; 5025 using _MaskImplBuiltin<_Abi>::_S_masked_assign; 5029 template <typename _Tp> 5030 _GLIBCXX_SIMD_INTRINSIC static bool 5031 _S_all_of(simd_mask<_Tp, _Abi> __k) 5033 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5035 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5036 using _TI = __intrinsic_type_t<_Tp, _Np>; 5037 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5038 if constexpr (__have_sse4_1) 5040 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5041 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5042 return 0 != __testc(__a, __b); 5044 else if constexpr (is_same_v<_Tp, float>) 5045 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) 5047 else if constexpr (is_same_v<_Tp, double>) 5048 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) 5051 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 5052 == (1 << (_Np * sizeof(_Tp))) - 1; 5054 else if constexpr (__is_avx512_abi<_Abi>()) 5056 constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>(); 5057 const auto __kk = __k._M_data._M_data; 5058 if constexpr (sizeof(__kk) == 1) 5060 if constexpr (__have_avx512dq) 5061 return _kortestc_mask8_u8(__kk, _Mask == 0xff 5063 : __mmask8(~_Mask)); 5065 return _kortestc_mask16_u8(__kk, __mmask16(~_Mask)); 5067 else if constexpr (sizeof(__kk) == 2) 5068 return _kortestc_mask16_u8(__kk, _Mask == 0xffff 5070 : __mmask16(~_Mask)); 5071 else if constexpr (sizeof(__kk) == 4 && __have_avx512bw) 5072 return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU 5074 : __mmask32(~_Mask)); 5075 else if constexpr (sizeof(__kk) == 8 && __have_avx512bw) 5076 return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL 5078 : __mmask64(~_Mask)); 5080 __assert_unreachable<_Tp>(); 5086 template <typename _Tp> 5087 _GLIBCXX_SIMD_INTRINSIC static bool 5088 _S_any_of(simd_mask<_Tp, _Abi> __k) 5090 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5092 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5093 using _TI = __intrinsic_type_t<_Tp, _Np>; 5094 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5095 if constexpr (__have_sse4_1) 5097 if constexpr (_Abi::template _S_is_partial< 5098 _Tp> || sizeof(__k) < 16) 5100 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5101 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5102 return 0 == __testz(__a, __b); 5105 return 0 == __testz(__a, __a); 5107 else if constexpr (is_same_v<_Tp, float>) 5108 return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0; 5109 else if constexpr (is_same_v<_Tp, double>) 5110 return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0; 5112 return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1)) 5115 else if constexpr 
(__is_avx512_abi<_Abi>()) 5116 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 5122 template <typename _Tp> 5123 _GLIBCXX_SIMD_INTRINSIC static bool 5124 _S_none_of(simd_mask<_Tp, _Abi> __k) 5126 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5128 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5129 using _TI = __intrinsic_type_t<_Tp, _Np>; 5130 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5131 if constexpr (__have_sse4_1) 5133 if constexpr (_Abi::template _S_is_partial< 5134 _Tp> || sizeof(__k) < 16) 5136 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5137 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5138 return 0 != __testz(__a, __b); 5141 return 0 != __testz(__a, __a); 5143 else if constexpr (is_same_v<_Tp, float>) 5144 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 5145 else if constexpr (is_same_v<_Tp, double>) 5146 return (__movemask(__a) & ((1 << _Np) - 1)) == 0; 5148 return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1)) 5151 else if constexpr (__is_avx512_abi<_Abi>()) 5152 return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>()) 5158 template <typename _Tp> 5159 _GLIBCXX_SIMD_INTRINSIC static bool 5160 _S_some_of(simd_mask<_Tp, _Abi> __k) 5162 if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>()) 5164 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5165 using _TI = __intrinsic_type_t<_Tp, _Np>; 5166 const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k))); 5167 if constexpr (__have_sse4_1) 5169 _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b 5170 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 5171 return 0 != __testnzc(__a, __b); 5173 else if constexpr (is_same_v<_Tp, float>) 5175 constexpr int __allbits = (1 << _Np) - 1; 5176 const auto __tmp = _mm_movemask_ps(__a) & __allbits; 5177 return __tmp > 0 && __tmp < __allbits; 5179 else if constexpr (is_same_v<_Tp, double>) 5181 constexpr int __allbits = (1 << _Np) - 1; 5182 const auto __tmp = _mm_movemask_pd(__a) & __allbits; 5183 return __tmp > 0 && __tmp < __allbits; 5187 constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1; 5188 const auto __tmp = _mm_movemask_epi8(__a) & __allbits; 5189 return __tmp > 0 && __tmp < __allbits; 5192 else if constexpr (__is_avx512_abi<_Abi>()) 5193 return _S_any_of(__k) && !_S_all_of(__k); 5195 __assert_unreachable<_Tp>(); 5200 template <typename _Tp> 5201 _GLIBCXX_SIMD_INTRINSIC static int 5202 _S_popcount(simd_mask<_Tp, _Abi> __k) 5204 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 5205 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data; 5206 if constexpr (__is_avx512_abi<_Abi>()) 5208 if constexpr (_Np > 32) 5209 return __builtin_popcountll(__kk); 5211 return __builtin_popcount(__kk); 5215 if constexpr (__have_popcnt) 5218 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk))); 5219 const int __count = __builtin_popcount(__bits); 5220 return is_integral_v<_Tp> ? 
__count / sizeof(_Tp) : __count; 5222 else if constexpr (_Np == 2 && sizeof(_Tp) == 8) 5224 const int mask = _mm_movemask_pd(__auto_bitcast(__kk)); 5225 return mask - (mask >> 1); 5227 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8) 5229 auto __x = -(__lo128(__kk) + __hi128(__kk)); 5230 return __x[0] + __x[1]; 5232 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4) 5234 if constexpr (__have_sse2) 5236 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk)); 5237 __x = _mm_add_epi32( 5238 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5239 __x = _mm_add_epi32( 5240 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2))); 5241 return -_mm_cvtsi128_si32(__x); 5244 return __builtin_popcount( 5245 _mm_movemask_ps(__auto_bitcast(__kk))); 5247 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2) 5249 auto __x = __to_intrin(__kk); 5250 __x = _mm_add_epi16(__x, 5251 _mm_shuffle_epi32(__x, 5252 _MM_SHUFFLE(0, 1, 2, 3))); 5253 __x = _mm_add_epi16( 5254 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3))); 5255 __x = _mm_add_epi16( 5256 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1))); 5257 return -short(_mm_extract_epi16(__x, 0)); 5259 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1) 5261 auto __x = __to_intrin(__kk); 5262 __x = _mm_add_epi8(__x, 5263 _mm_shuffle_epi32(__x, 5264 _MM_SHUFFLE(0, 1, 2, 3))); 5265 __x = _mm_add_epi8(__x, 5266 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 5268 __x = _mm_add_epi8(__x, 5269 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 5271 auto __y = -__vector_bitcast<_UChar>(__x); 5272 if constexpr (__have_sse4_1) 5273 return __y[0] + __y[1]; 5276 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0); 5277 return (__z & 0xff) + (__z >> 8); 5280 else if constexpr (sizeof(__kk) == 32) 5282 // The following works only as long as the implementations above 5284 using _I = __int_for_sizeof_t<_Tp>; 5285 const auto __as_int = __vector_bitcast<_I>(__kk); 5286 _MaskImplX86<simd_abi::__sse>::_S_popcount( 5287 simd_mask<_I, simd_abi::__sse>(__private_init, 5289 + __hi128(__as_int))); 5292 __assert_unreachable<_Tp>(); 5297 // _S_find_first_set {{{ 5298 template <typename _Tp> 5299 _GLIBCXX_SIMD_INTRINSIC static int 5300 _S_find_first_set(simd_mask<_Tp, _Abi> __k) 5302 if constexpr (__is_avx512_abi<_Abi>()) 5303 return std::__countr_zero(__k._M_data._M_data); 5305 return _Base::_S_find_first_set(__k); 5309 // _S_find_last_set {{{ 5310 template <typename _Tp> 5311 _GLIBCXX_SIMD_INTRINSIC static int 5312 _S_find_last_set(simd_mask<_Tp, _Abi> __k) 5314 if constexpr (__is_avx512_abi<_Abi>()) 5315 return std::__bit_width(__k._M_data._M_data) - 1; 5317 return _Base::_S_find_last_set(__k); 5325 _GLIBCXX_SIMD_END_NAMESPACE 5326 #endif // __cplusplus >= 201703L 5327 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_ 5329 // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80 typename conditional< _Cond, _Iftrue, _Iffalse >::type conditional_t
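The backend hooks listed above (_S_isless and the other comparison kernels, _S_masked_unary, and the _MaskImplX86 reductions _S_all_of/_S_any_of/_S_popcount/_S_find_first_set) are only reached through the public std::experimental::simd interface. The following is a minimal usage sketch, not part of the header: it assumes GCC with -std=c++17 and some -m/-march setting so one of the SSE/AVX/AVX-512 branches above is selected; the stdx alias, the sample values, and the printed labels are purely illustrative.

  #include <experimental/simd>
  #include <cstdio>

  namespace stdx = std::experimental;

  int main()
  {
    using V = stdx::native_simd<float>;
    using M = V::mask_type;

    V a([](int i) { return i - 2.0f; }); // -2, -1, 0, 1, ...
    V b = 0.5f;

    // The TS comparison functions return a simd_mask; on this backend
    // isless() should land in the _S_isless hook above (a _CMP_LT_OQ
    // vector compare, or a k-register compare on an AVX-512 ABI).
    M lt = stdx::isless(a, b);

    // Mask reductions map onto _MaskImplX86::_S_any_of, _S_all_of,
    // _S_popcount and _S_find_first_set shown in this file.
    std::printf("any %d  all %d  popcount %d  first %d\n",
                int(stdx::any_of(lt)), int(stdx::all_of(lt)),
                stdx::popcount(lt), stdx::find_first_set(lt));

    // Masked increment; with an AVX-512 ABI this can take the
    // _S_masked_unary path above, which rewrites ++ as a masked
    // subtraction of -1.
    stdx::where(lt, a)++;

    return 0;
  }

Which of the if constexpr branches actually runs is fixed at compile time by the __have_* feature flags and the ABI tag, so the same program built with -mavx512f -mavx512bw and with plain -msse2 exercises different code paths above.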