libstdc++
simd_builtin.h
1 // Simd Abi specific implementations -*- C++ -*-
2 
3 // Copyright (C) 2020-2023 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
26 #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
27 
28 #if __cplusplus >= 201703L
29 
30 #include <array>
31 #include <cmath>
32 #include <cstdlib>
33 
34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
35 // _S_allbits{{{
36 template <typename _V>
37  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
38  = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());
39 
40 // }}}
41 // _S_signmask, _S_absmask{{{
42 template <typename _V, typename = _VectorTraits<_V>>
43  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
44  = __xor(_V() + 1, _V() - 1);
45 
46 template <typename _V, typename = _VectorTraits<_V>>
47  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
48  = __andnot(_S_signmask<_V>, _S_allbits<_V>);
49 
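// Illustrative values, assuming a floating-point element type such as
// _V = __vector_type_t<float, 4>: 1.f and -1.f differ only in the sign bit,
// so each element of _S_signmask is 0x80000000 and each element of
// _S_absmask is the complement 0x7fffffff (useful for clearing the sign
// bit, i.e. fabs, via __and).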
50 //}}}
51 // __vector_permute<Indices...>{{{
52 // Index == -1 requests zeroing of the output element
53 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
54  typename = __detail::__odr_helper>
55  constexpr _Tp
56  __vector_permute(_Tp __x)
57  {
58  static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
59  return __make_vector<typename _TVT::value_type>(
60  (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
61  }
62 
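// E.g., for a 4-element vector __x = {a, b, c, d}:
//   __vector_permute<3, 2, 1, 0>(__x)   -> {d, c, b, a}
//   __vector_permute<0, -1, 1, -1>(__x) -> {a, 0, b, 0}
// The number of indices must equal _TVT::_S_full_size.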
63 // }}}
64 // __vector_shuffle<Indices...>{{{
65 // Index == -1 requests zeroing of the output element
66 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
67  typename = __detail::__odr_helper>
68  constexpr _Tp
69  __vector_shuffle(_Tp __x, _Tp __y)
70  {
71  return _Tp{(_Indices == -1 ? 0
72  : _Indices < _TVT::_S_full_size
73  ? __x[_Indices]
74  : __y[_Indices - _TVT::_S_full_size])...};
75  }
76 
77 // }}}
78 // __make_wrapper{{{
79 template <typename _Tp, typename... _Args>
80  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
81  __make_wrapper(const _Args&... __args)
82  { return __make_vector<_Tp>(__args...); }
83 
84 // }}}
85 // __wrapper_bitcast{{{
86 template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
87  size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
88  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
89  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
90  {
91  static_assert(_Np > 1);
92  return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
93  }
94 
95 // }}}
96 // __shift_elements_right{{{
97 // if (__shift % 2ⁿ == 0) => the low n Bytes are correct
98 template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
99  _GLIBCXX_SIMD_INTRINSIC _Tp
100  __shift_elements_right(_Tp __v)
101  {
102  [[maybe_unused]] const auto __iv = __to_intrin(__v);
103  static_assert(__shift <= sizeof(_Tp));
104  if constexpr (__shift == 0)
105  return __v;
106  else if constexpr (__shift == sizeof(_Tp))
107  return _Tp();
108 #if _GLIBCXX_SIMD_X86INTRIN // {{{
109  else if constexpr (__have_sse && __shift == 8
110  && _TVT::template _S_is<float, 4>)
111  return _mm_movehl_ps(__iv, __iv);
112  else if constexpr (__have_sse2 && __shift == 8
113  && _TVT::template _S_is<double, 2>)
114  return _mm_unpackhi_pd(__iv, __iv);
115  else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
116  return reinterpret_cast<typename _TVT::type>(
117  _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
118  else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
119  {
120  /*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
121  return _mm256_permute2f128_pd(__iv, __iv, 0x81);
122  else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
123  return _mm256_permute2f128_ps(__iv, __iv, 0x81);
124  else if constexpr (__have_avx)
125  return reinterpret_cast<typename _TVT::type>(
126  _mm256_permute2f128_si256(__iv, __iv, 0x81));
127  else*/
128  return __zero_extend(__hi128(__v));
129  }
130  else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
131  {
132  const auto __vll = __vector_bitcast<_LLong>(__v);
133  return reinterpret_cast<typename _TVT::type>(
134  _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
135  __vll, __shift));
136  }
137  else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
138  {
139  const auto __vll = __vector_bitcast<_LLong>(__v);
140  return reinterpret_cast<typename _TVT::type>(
141  __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
142  _mm_srli_si128(__hi128(__vll), __shift)));
143  }
144  else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
145  return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
146  else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
147  return __zero_extend(__hi256(__v));
148  else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
149  {
150  if constexpr (__shift >= 48)
151  return __zero_extend(
152  __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
153  else if constexpr (__shift >= 32)
154  return __zero_extend(
155  __shift_elements_right<__shift - 32>(__hi256(__v)));
156  else if constexpr (__shift % 8 == 0)
157  return reinterpret_cast<typename _TVT::type>(
158  _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
159  __shift / 8));
160  else if constexpr (__shift % 4 == 0)
161  return reinterpret_cast<typename _TVT::type>(
162  _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
163  __shift / 4));
164  else if constexpr (__have_avx512bw && __shift < 16)
165  {
166  const auto __vll = __vector_bitcast<_LLong>(__v);
167  return reinterpret_cast<typename _TVT::type>(
168  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
169  __vll, __shift));
170  }
171  else if constexpr (__have_avx512bw && __shift < 32)
172  {
173  const auto __vll = __vector_bitcast<_LLong>(__v);
174  return reinterpret_cast<typename _TVT::type>(
175  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
176  _mm512_shuffle_i32x4(__vll, __vll, 0xf9),
177  __shift - 16));
178  }
179  else
180  __assert_unreachable<_Tp>();
181  }
182  /*
183  } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
184  return __auto_bitcast(__extract<__shift / 16, 4>(__v));
185  */
186 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
187  else
188  {
189  constexpr int __chunksize = __shift % 8 == 0 ? 8
190  : __shift % 4 == 0 ? 4
191  : __shift % 2 == 0 ? 2
192  : 1;
193  auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
194  using _Up = decltype(__w);
195  return __intrin_bitcast<_Tp>(
196  __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
197  [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
198  return _Up{__chunks...};
199  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
200  return __w[__shift / __chunksize + __i];
201  }));
202  }
203  }
204 
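// E.g., for a 16-byte vector of four ints {a, b, c, d},
// __shift_elements_right<8>(__v) places {c, d} in the low half: the
// template argument is a byte count and elements move towards index 0 (as
// in _mm_srli_si128). Only the low sizeof(_Tp) - __shift bytes are
// guaranteed; most branches zero-fill the rest, but e.g. the
// _mm_movehl_ps branch leaves copies of the high elements there.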
205 // }}}
206 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
207 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
208  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
209  _SimdWrapper<_Tp, _Np / _Total * _Combine>
210  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
211  {
212  if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
213  return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
214  else
215  {
216  constexpr size_t __values_per_part = _Np / _Total;
217  constexpr size_t __values_to_skip = _Index * __values_per_part;
218  constexpr size_t __return_size = __values_per_part * _Combine;
219  using _R = __vector_type_t<_Tp, __return_size>;
220  static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
221  <= sizeof(__x),
222  "out of bounds __extract_part");
223  // the following assertion would ensure no "padding" to be read
224  // static_assert(_Total >= _Index + _Combine, "_Total must be greater
225  // than _Index");
226 
227  // static_assert(__return_size * _Total == _Np, "_Np must be divisible
228  // by _Total");
229  if (__x._M_is_constprop())
230  return __generate_from_n_evaluations<__return_size, _R>(
231  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
232  return __x[__values_to_skip + __i];
233  });
234  if constexpr (_Index == 0 && _Total == 1)
235  return __x;
236  else if constexpr (_Index == 0)
237  return __intrin_bitcast<_R>(__as_vector(__x));
238 #if _GLIBCXX_SIMD_X86INTRIN // {{{
239  else if constexpr (sizeof(__x) == 32
240  && __return_size * sizeof(_Tp) <= 16)
241  {
242  constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
243  if constexpr (__bytes_to_skip == 16)
244  return __vector_bitcast<_Tp, __return_size>(
245  __hi128(__as_vector(__x)));
246  else
247  return __vector_bitcast<_Tp, __return_size>(
248  _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
249  __lo128(__vector_bitcast<_LLong>(__x)),
250  __bytes_to_skip));
251  }
252 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
253  else if constexpr (_Index > 0
254  && (__values_to_skip % __return_size != 0
255  || sizeof(_R) >= 8)
256  && (__values_to_skip + __return_size) * sizeof(_Tp)
257  <= 64
258  && sizeof(__x) >= 16)
259  return __intrin_bitcast<_R>(
260  __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
261  __as_vector(__x)));
262  else
263  {
264  _R __r = {};
265  __builtin_memcpy(&__r,
266  reinterpret_cast<const char*>(&__x)
267  + sizeof(_Tp) * __values_to_skip,
268  __return_size * sizeof(_Tp));
269  return __r;
270  }
271  }
272  }
273 
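// E.g., for a _SimdWrapper<float, 8> __x, __extract_part<1, 4, 1>(__x)
// returns elements 2..3 (part 1 of 4 parts with 2 elements each), while
// __extract_part<1, 4, 2>(__x) combines two adjacent parts and returns
// elements 2..5.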
274 // }}}
275 // __extract_part(_SimdWrapper<bool, _Np>) {{{
276 template <int _Index, int _Total, int _Combine = 1, size_t _Np>
277  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
278  __extract_part(const _SimdWrapper<bool, _Np> __x)
279  {
280  static_assert(_Combine == 1, "_Combine != 1 not implemented");
281  static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total && _Index >= 0);
282  return __x._M_data >> (_Index * _Np / _Total);
283  }
284 
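// E.g., for a _SimdWrapper<bool, 16> __x (a 16-bit AVX512 mask),
// __extract_part<1, 2>(__x) shifts the mask right by 8 and returns the
// upper 8 mask bits as a _SimdWrapper<bool, 8>.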
285 // }}}
286 
287 // __vector_convert {{{
288 // implementation requires an index sequence
289 template <typename _To, typename _From, size_t... _I>
290  _GLIBCXX_SIMD_INTRINSIC constexpr _To
291  __vector_convert(_From __a, index_sequence<_I...>)
292  {
293  using _Tp = typename _VectorTraits<_To>::value_type;
294  return _To{static_cast<_Tp>(__a[_I])...};
295  }
296 
297 template <typename _To, typename _From, size_t... _I>
298  _GLIBCXX_SIMD_INTRINSIC constexpr _To
299  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
300  {
301  using _Tp = typename _VectorTraits<_To>::value_type;
302  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
303  }
304 
305 template <typename _To, typename _From, size_t... _I>
306  _GLIBCXX_SIMD_INTRINSIC constexpr _To
307  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
308  {
309  using _Tp = typename _VectorTraits<_To>::value_type;
310  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
311  static_cast<_Tp>(__c[_I])...};
312  }
313 
314 template <typename _To, typename _From, size_t... _I>
315  _GLIBCXX_SIMD_INTRINSIC constexpr _To
316  __vector_convert(_From __a, _From __b, _From __c, _From __d,
317  index_sequence<_I...>)
318  {
319  using _Tp = typename _VectorTraits<_To>::value_type;
320  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
321  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
322  }
323 
324 template <typename _To, typename _From, size_t... _I>
325  _GLIBCXX_SIMD_INTRINSIC constexpr _To
326  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
327  index_sequence<_I...>)
328  {
329  using _Tp = typename _VectorTraits<_To>::value_type;
330  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
331  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
332  static_cast<_Tp>(__e[_I])...};
333  }
334 
335 template <typename _To, typename _From, size_t... _I>
336  _GLIBCXX_SIMD_INTRINSIC constexpr _To
337  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
338  _From __f, index_sequence<_I...>)
339  {
340  using _Tp = typename _VectorTraits<_To>::value_type;
341  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
342  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
343  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
344  }
345 
346 template <typename _To, typename _From, size_t... _I>
347  _GLIBCXX_SIMD_INTRINSIC constexpr _To
348  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
349  _From __f, _From __g, index_sequence<_I...>)
350  {
351  using _Tp = typename _VectorTraits<_To>::value_type;
352  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
353  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
354  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
355  static_cast<_Tp>(__g[_I])...};
356  }
357 
358 template <typename _To, typename _From, size_t... _I>
359  _GLIBCXX_SIMD_INTRINSIC constexpr _To
360  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
361  _From __f, _From __g, _From __h, index_sequence<_I...>)
362  {
363  using _Tp = typename _VectorTraits<_To>::value_type;
364  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
365  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
366  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
367  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
368  }
369 
370 template <typename _To, typename _From, size_t... _I>
371  _GLIBCXX_SIMD_INTRINSIC constexpr _To
372  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
373  _From __f, _From __g, _From __h, _From __i,
374  index_sequence<_I...>)
375  {
376  using _Tp = typename _VectorTraits<_To>::value_type;
377  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
378  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
379  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
380  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
381  static_cast<_Tp>(__i[_I])...};
382  }
383 
384 template <typename _To, typename _From, size_t... _I>
385  _GLIBCXX_SIMD_INTRINSIC constexpr _To
386  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
387  _From __f, _From __g, _From __h, _From __i, _From __j,
388  index_sequence<_I...>)
389  {
390  using _Tp = typename _VectorTraits<_To>::value_type;
391  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
392  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
393  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
394  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
395  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
396  }
397 
398 template <typename _To, typename _From, size_t... _I>
399  _GLIBCXX_SIMD_INTRINSIC constexpr _To
400  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
401  _From __f, _From __g, _From __h, _From __i, _From __j,
402  _From __k, index_sequence<_I...>)
403  {
404  using _Tp = typename _VectorTraits<_To>::value_type;
405  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
406  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
407  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
408  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
409  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
410  static_cast<_Tp>(__k[_I])...};
411  }
412 
413 template <typename _To, typename _From, size_t... _I>
414  _GLIBCXX_SIMD_INTRINSIC constexpr _To
415  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
416  _From __f, _From __g, _From __h, _From __i, _From __j,
417  _From __k, _From __l, index_sequence<_I...>)
418  {
419  using _Tp = typename _VectorTraits<_To>::value_type;
420  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
421  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
422  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
423  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
424  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
425  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
426  }
427 
428 template <typename _To, typename _From, size_t... _I>
429  _GLIBCXX_SIMD_INTRINSIC constexpr _To
430  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
431  _From __f, _From __g, _From __h, _From __i, _From __j,
432  _From __k, _From __l, _From __m, index_sequence<_I...>)
433  {
434  using _Tp = typename _VectorTraits<_To>::value_type;
435  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
436  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
437  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
438  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
439  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
440  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
441  static_cast<_Tp>(__m[_I])...};
442  }
443 
444 template <typename _To, typename _From, size_t... _I>
445  _GLIBCXX_SIMD_INTRINSIC constexpr _To
446  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
447  _From __f, _From __g, _From __h, _From __i, _From __j,
448  _From __k, _From __l, _From __m, _From __n,
449  index_sequence<_I...>)
450  {
451  using _Tp = typename _VectorTraits<_To>::value_type;
452  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
453  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
454  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
455  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
456  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
457  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
458  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
459  }
460 
461 template <typename _To, typename _From, size_t... _I>
462  _GLIBCXX_SIMD_INTRINSIC constexpr _To
463  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
464  _From __f, _From __g, _From __h, _From __i, _From __j,
465  _From __k, _From __l, _From __m, _From __n, _From __o,
466  index_sequence<_I...>)
467  {
468  using _Tp = typename _VectorTraits<_To>::value_type;
469  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
470  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
471  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
472  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
473  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
474  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
475  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
476  static_cast<_Tp>(__o[_I])...};
477  }
478 
479 template <typename _To, typename _From, size_t... _I>
480  _GLIBCXX_SIMD_INTRINSIC constexpr _To
481  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
482  _From __f, _From __g, _From __h, _From __i, _From __j,
483  _From __k, _From __l, _From __m, _From __n, _From __o,
484  _From __p, index_sequence<_I...>)
485  {
486  using _Tp = typename _VectorTraits<_To>::value_type;
487  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
488  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
489  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
490  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
491  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
492  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
493  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
494  static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
495  }
496 
497 // Defer actual conversion to the overload that takes an index sequence. Note
498 // that this function adds zeros or drops values off the end if you don't ensure
499 // matching width.
500 template <typename _To, typename... _From, size_t _FromSize>
501  _GLIBCXX_SIMD_INTRINSIC constexpr _To
502  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
503  {
504 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
505  using _From0 = __first_of_pack_t<_From...>;
506  using _FW = _SimdWrapper<_From0, _FromSize>;
507  if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
508  {
509  if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
510  == 0) // power-of-two number of arguments
511  return __convert_x86<_To>(__as_vector(__xs)...);
512  else // append zeros and recurse until the above branch is taken
513  return __vector_convert<_To>(__xs..., _FW{});
514  }
515  else
516 #endif
517  return __vector_convert<_To>(
518  __as_vector(__xs)...,
519  make_index_sequence<(sizeof...(__xs) == 1 ? std::min(
520  _VectorTraits<_To>::_S_full_size, int(_FromSize))
521  : _FromSize)>());
522  }
523 
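// E.g., on x86 (with the PR85048 workaround active) converting three full
// _SimdWrapper<int, 4> arguments appends one zero-initialized wrapper to
// reach a power-of-two argument count and then calls __convert_x86 once;
// otherwise the index-sequence overloads above are used. Either way,
// output elements beyond the converted inputs are zero.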
524 // }}}
525 // __convert function{{{
526 template <typename _To, typename _From, typename... _More>
527  _GLIBCXX_SIMD_INTRINSIC constexpr auto
528  __convert(_From __v0, _More... __vs)
529  {
530  static_assert((true && ... && is_same_v<_From, _More>) );
531  if constexpr (__is_vectorizable_v<_From>)
532  {
533  using _V = typename _VectorTraits<_To>::type;
534  using _Tp = typename _VectorTraits<_To>::value_type;
535  return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
536  }
537  else if constexpr (__is_vector_type_v<_From>)
538  return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
539  else // _SimdWrapper arguments
540  {
541  constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
542  if constexpr (__is_vectorizable_v<_To>)
543  return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
544  else if constexpr (!__is_vector_type_v<_To>)
545  return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
546  else
547  {
548  static_assert(
549  sizeof...(_More) == 0
550  || _VectorTraits<_To>::_S_full_size >= __input_size,
551  "__convert(...) requires the input to fit into the output");
552  return __vector_convert<_To>(__v0, __vs...);
553  }
554  }
555  }
556 
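// Usage sketch: __convert<__vector_type_t<float, 4>>(_SimdWrapper<int, 4>())
// performs the element-wise int -> float conversion; passing two
// _SimdWrapper<int, 4> arguments requires a _To with at least 8 elements
// (enforced by the static_assert above).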
557 // }}}
558 // __convert_all{{{
559 // Converts __v into array<_To, N>, where N is _NParts if non-zero or
560 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
561 // Note: this function may return fewer than all converted elements.
562 template <typename _To,
563  size_t _NParts = 0, // allows converting fewer or more than all (only
564  // the last _To may be partially filled)
565  size_t _Offset = 0, // where to start, # of elements (not Bytes or
566  // Parts)
567  typename _From, typename _FromVT = _VectorTraits<_From>>
568  _GLIBCXX_SIMD_INTRINSIC auto
569  __convert_all(_From __v)
570  {
571  if constexpr (is_arithmetic_v<_To> && _NParts != 1)
572  {
573  static_assert(_Offset < _FromVT::_S_full_size);
574  constexpr auto _Np
575  = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
576  return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
577  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
578  return static_cast<_To>(__v[__i + _Offset]);
579  });
580  }
581  else
582  {
583  static_assert(__is_vector_type_v<_To>);
584  using _ToVT = _VectorTraits<_To>;
585  if constexpr (__is_vector_type_v<_From>)
586  return __convert_all<_To, _NParts>(__as_wrapper(__v));
587  else if constexpr (_NParts == 1)
588  {
589  static_assert(_Offset % _ToVT::_S_full_size == 0);
590  return array<_To, 1>{__vector_convert<_To>(
591  __extract_part<_Offset / _ToVT::_S_full_size,
592  __div_roundup(_FromVT::_S_partial_width,
593  _ToVT::_S_full_size)>(__v))};
594  }
595 #if _GLIBCXX_SIMD_X86INTRIN // {{{
596  else if constexpr (!__have_sse4_1 && _Offset == 0
597  && is_integral_v<typename _FromVT::value_type>
598  && sizeof(typename _FromVT::value_type)
599  < sizeof(typename _ToVT::value_type)
600  && !(sizeof(typename _FromVT::value_type) == 4
601  && is_same_v<typename _ToVT::value_type, double>))
602  {
603  using _ToT = typename _ToVT::value_type;
604  using _FromT = typename _FromVT::value_type;
605  constexpr size_t _Np
606  = _NParts != 0
607  ? _NParts
608  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
609  using _R = array<_To, _Np>;
610  // __adjust modifies its input to have _Np (use _SizeConstant)
611  // entries so that no unnecessary intermediate conversions are
612  // requested and, more importantly, no intermediate conversions are
613  // missing
614  [[maybe_unused]] auto __adjust
615  = [](auto __n,
616  auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
617  return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
618  };
619  [[maybe_unused]] const auto __vi = __to_intrin(__v);
620  auto&& __make_array
621  = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
622  if constexpr (_Np == 1)
623  return _R{__intrin_bitcast<_To>(__x0)};
624  else
625  return _R{__intrin_bitcast<_To>(__x0),
626  __intrin_bitcast<_To>(__x1)};
627  };
628 
629  if constexpr (_Np == 0)
630  return _R{};
631  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
632  {
633  static_assert(is_integral_v<_FromT>);
634  static_assert(is_integral_v<_ToT>);
635  if constexpr (is_unsigned_v<_FromT>)
636  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
637  _mm_unpackhi_epi8(__vi, __m128i()));
638  else
639  return __make_array(
640  _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
641  _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
642  }
643  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
644  {
645  static_assert(is_integral_v<_FromT>);
646  if constexpr (is_floating_point_v<_ToT>)
647  {
648  const auto __ints
649  = __convert_all<__vector_type16_t<int>, _Np>(
650  __adjust(_SizeConstant<_Np * 4>(), __v));
651  return __generate_from_n_evaluations<_Np, _R>(
652  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
653  return __vector_convert<_To>(__as_wrapper(__ints[__i]));
654  });
655  }
656  else if constexpr (is_unsigned_v<_FromT>)
657  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
658  _mm_unpackhi_epi16(__vi, __m128i()));
659  else
660  return __make_array(
661  _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
662  _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
663  }
664  else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
665  && is_integral_v<_FromT> && is_integral_v<_ToT>)
666  {
667  if constexpr (is_unsigned_v<_FromT>)
668  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
669  _mm_unpackhi_epi32(__vi, __m128i()));
670  else
671  return __make_array(
672  _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
673  _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
674  }
686  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
687  && is_signed_v<_FromT>)
688  {
689  const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
690  _mm_unpackhi_epi8(__vi, __vi)};
691  const __vector_type_t<int, 4> __vvvv[4] = {
692  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
693  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
694  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
695  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
696  if constexpr (sizeof(_ToT) == 4)
697  return __generate_from_n_evaluations<_Np, _R>(
698  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
699  return __vector_convert<_To>(
700  _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
701  });
702  else if constexpr (is_integral_v<_ToT>)
703  return __generate_from_n_evaluations<_Np, _R>(
704  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
705  const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
706  const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
707  return __vector_bitcast<_ToT>(
708  __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
709  : _mm_unpackhi_epi32(__sx32, __signbits));
710  });
711  else
712  return __generate_from_n_evaluations<_Np, _R>(
713  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
714  const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
715  return __vector_convert<_To>(
716  __i % 2 == 0 ? __int4
717  : _SimdWrapper<int, 4>(
718  _mm_unpackhi_epi64(__to_intrin(__int4),
719  __to_intrin(__int4))));
720  });
721  }
722  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
723  {
724  const auto __shorts = __convert_all<__vector_type16_t<
725  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
726  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
727  return __generate_from_n_evaluations<_Np, _R>(
728  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
729  return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
730  });
731  }
732  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
733  && is_signed_v<_FromT> && is_integral_v<_ToT>)
734  {
735  const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
736  _mm_unpackhi_epi16(__vi, __vi)};
737  const __vector_type16_t<int> __vvvv[4]
738  = {__vector_bitcast<int>(
739  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
740  _mm_srai_epi32(__vv[0], 31))),
741  __vector_bitcast<int>(
742  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
743  _mm_srai_epi32(__vv[0], 31))),
744  __vector_bitcast<int>(
745  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
746  _mm_srai_epi32(__vv[1], 31))),
747  __vector_bitcast<int>(
748  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
749  _mm_srai_epi32(__vv[1], 31)))};
750  return __generate_from_n_evaluations<_Np, _R>(
751  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
752  return __vector_bitcast<_ToT>(__vvvv[__i]);
753  });
754  }
755  else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
756  {
757  const auto __ints
758  = __convert_all<__vector_type16_t<conditional_t<
759  is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
760  unsigned int>>>(
761  __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
762  return __generate_from_n_evaluations<_Np, _R>(
763  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
764  return __convert_all<_To>(__ints[__i / 2])[__i % 2];
765  });
766  }
767  else
768  __assert_unreachable<_To>();
769  }
770 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
771  else if constexpr ((_FromVT::_S_partial_width - _Offset)
772  > _ToVT::_S_full_size)
773  {
774  /*
775  static_assert(
776  (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) ==
777  0,
778  "__convert_all only supports power-of-2 number of elements. Otherwise "
779  "the return type cannot be array<_To, N>.");
780  */
781  constexpr size_t _NTotal
782  = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
783  constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
784  static_assert(
785  _Np <= _NTotal
786  || (_Np == _NTotal + 1
787  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
788  > 0));
789  using _R = array<_To, _Np>;
790  if constexpr (_Np == 1)
791  return _R{__vector_convert<_To>(
792  __extract_part<_Offset, _FromVT::_S_partial_width,
793  _ToVT::_S_full_size>(__v))};
794  else
795  return __generate_from_n_evaluations<_Np, _R>(
796  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
797  auto __part
798  = __extract_part<__i * _ToVT::_S_full_size + _Offset,
799  _FromVT::_S_partial_width,
800  _ToVT::_S_full_size>(__v);
801  return __vector_convert<_To>(__part);
802  });
803  }
804  else if constexpr (_Offset == 0)
805  return array<_To, 1>{__vector_convert<_To>(__v)};
806  else
807  return array<_To, 1>{__vector_convert<_To>(
808  __extract_part<_Offset, _FromVT::_S_partial_width,
809  _FromVT::_S_partial_width - _Offset>(__v))};
810  }
811  }
812 
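// E.g., __convert_all<__vector_type_t<double, 2>>(__v) for an 8-element
// float vector __v yields array<__vector_type_t<double, 2>, 4>, each entry
// holding two converted elements; with _NParts = 2 only elements 0..3 are
// converted, and with _Offset = 4 conversion starts at element 4.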
813 // }}}
814 
815 // _GnuTraits {{{
816 template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
817  struct _GnuTraits
818  {
819  using _IsValid = true_type;
820  using _SimdImpl = typename _Abi::_SimdImpl;
821  using _MaskImpl = typename _Abi::_MaskImpl;
822 
823  // simd and simd_mask member types {{{
824  using _SimdMember = _SimdWrapper<_Tp, _Np>;
825  using _MaskMember = _SimdWrapper<_Mp, _Np>;
826  static constexpr size_t _S_simd_align = alignof(_SimdMember);
827  static constexpr size_t _S_mask_align = alignof(_MaskMember);
828 
829  // }}}
830  // size metadata {{{
831  static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
832  static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
833 
834  // }}}
835  // _SimdBase / base class for simd, providing extra conversions {{{
836  struct _SimdBase2
837  {
838  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
839  operator __intrinsic_type_t<_Tp, _Np>() const
840  { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); }
841 
842  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
843  operator __vector_type_t<_Tp, _Np>() const
844  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
845  };
846 
847  struct _SimdBase1
848  {
849  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
850  operator __intrinsic_type_t<_Tp, _Np>() const
851  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
852  };
853 
854  using _SimdBase = conditional_t<
855  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
856  _SimdBase1, _SimdBase2>;
857 
858  // }}}
859  // _MaskBase {{{
860  struct _MaskBase2
861  {
862  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
863  operator __intrinsic_type_t<_Tp, _Np>() const
864  { return static_cast<const simd_mask<_Tp, _Abi>*>(this) ->_M_data.__intrin(); }
865 
866  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
867  operator __vector_type_t<_Tp, _Np>() const
868  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; }
869  };
870 
871  struct _MaskBase1
872  {
873  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
874  operator __intrinsic_type_t<_Tp, _Np>() const
875  { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
876  };
877 
878  using _MaskBase = conditional_t<
879  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
880  _MaskBase1, _MaskBase2>;
881 
882  // }}}
883  // _MaskCastType {{{
884  // parameter type of one explicit simd_mask constructor
885  class _MaskCastType
886  {
887  using _Up = __intrinsic_type_t<_Tp, _Np>;
888  _Up _M_data;
889 
890  public:
891  _GLIBCXX_SIMD_ALWAYS_INLINE
892  _MaskCastType(_Up __x) : _M_data(__x) {}
893 
894  _GLIBCXX_SIMD_ALWAYS_INLINE
895  operator _MaskMember() const { return _M_data; }
896  };
897 
898  // }}}
899  // _SimdCastType {{{
900  // parameter type of one explicit simd constructor
901  class _SimdCastType1
902  {
903  using _Ap = __intrinsic_type_t<_Tp, _Np>;
904  _SimdMember _M_data;
905 
906  public:
907  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
908  _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
909 
910  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
911  operator _SimdMember() const { return _M_data; }
912  };
913 
914  class _SimdCastType2
915  {
916  using _Ap = __intrinsic_type_t<_Tp, _Np>;
917  using _Bp = __vector_type_t<_Tp, _Np>;
918  _SimdMember _M_data;
919 
920  public:
921  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
922  _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
923 
924  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
925  _SimdCastType2(_Bp __b) : _M_data(__b) {}
926 
927  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
928  operator _SimdMember() const { return _M_data; }
929  };
930 
931  using _SimdCastType = conditional_t<
932  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
933  _SimdCastType1, _SimdCastType2>;
934  //}}}
935  };
936 
937 // }}}
938 struct _CommonImplX86;
939 struct _CommonImplNeon;
940 struct _CommonImplBuiltin;
941 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
942 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
943 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
944 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
945 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
946 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
947 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
948 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;
949 
950 // simd_abi::_VecBuiltin {{{
951 template <int _UsedBytes>
952  struct simd_abi::_VecBuiltin
953  {
954  template <typename _Tp>
955  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
956 
957  // validity traits {{{
958  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
959 
960  template <typename _Tp>
961  struct _IsValidSizeFor
962  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
963  && _UsedBytes % sizeof(_Tp) == 0
964  && _UsedBytes <= __vectorized_sizeof<_Tp>()
965  && (!__have_avx512f || _UsedBytes <= 32))> {};
966 
967  template <typename _Tp>
968  struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
969  _IsValidSizeFor<_Tp>> {};
970 
971  template <typename _Tp>
972  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
973 
974  // }}}
975  // _SimdImpl/_MaskImpl {{{
976 #if _GLIBCXX_SIMD_X86INTRIN
977  using _CommonImpl = _CommonImplX86;
978  using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
979  using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
980 #elif _GLIBCXX_SIMD_HAVE_NEON
981  using _CommonImpl = _CommonImplNeon;
982  using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
983  using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
984 #else
985  using _CommonImpl = _CommonImplBuiltin;
986 #ifdef __ALTIVEC__
987  using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
988  using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
989 #else
990  using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
991  using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
992 #endif
993 #endif
994 
995  // }}}
996  // __traits {{{
997  template <typename _Tp>
998  using _MaskValueType = __int_for_sizeof_t<_Tp>;
999 
1000  template <typename _Tp>
1001  using __traits
1002  = conditional_t<_S_is_valid_v<_Tp>,
1003  _GnuTraits<_Tp, _MaskValueType<_Tp>,
1004  _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
1005  _InvalidTraits>;
1006 
1007  //}}}
1008  // size metadata {{{
1009  template <typename _Tp>
1010  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1011 
1012  template <typename _Tp>
1013  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1014 
1015  // }}}
1016  // implicit masks {{{
1017  template <typename _Tp>
1018  using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;
1019 
1020  template <typename _Tp>
1021  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1022  _S_implicit_mask()
1023  {
1024  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
1025  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1026  return ~_UV();
1027  else
1028  {
1029  constexpr auto __size = _S_size<_Tp>;
1030  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
1031  = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1032  { return __i < __size ? -1 : 0; });
1033  return __r;
1034  }
1035  }
1036 
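// E.g., for _Tp = float in _VecBuiltin<12> (_S_size = 3, full size 4) the
// implicit mask is {-1, -1, -1, 0} (as a vector of int), marking the three
// used elements; for a non-partial ABI it is simply all bits set.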
1037  template <typename _Tp>
1038  _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
1039  _S_implicit_mask_intrin()
1040  { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }
1041 
1042  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1043  _GLIBCXX_SIMD_INTRINSIC static constexpr _TW
1044  _S_masked(_TW __x)
1045  {
1046  using _Tp = typename _TVT::value_type;
1047  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1048  return __x;
1049  else
1050  return __and(__as_vector(__x),
1051  __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
1052  }
1053 
1054  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1055  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1056  __make_padding_nonzero(_TW __x)
1057  {
1058  using _Tp = typename _TVT::value_type;
1059  if constexpr (!_S_is_partial<_Tp>)
1060  return __x;
1061  else
1062  {
1063  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
1064  = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
1065  if constexpr (is_integral_v<_Tp>)
1066  return __or(__x, ~__implicit_mask);
1067  else
1068  {
1069  _GLIBCXX_SIMD_USE_CONSTEXPR auto __one
1070  = __andnot(__implicit_mask,
1071  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
1072  // it's not enough to return `x | 1_in_padding` because the
1073  // padding in x might be inf or nan (independent of
1074  // __FINITE_MATH_ONLY__, because it's about padding bits)
1075  return __or(__and(__x, __implicit_mask), __one);
1076  }
1077  }
1078  }
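// __make_padding_nonzero is meant for operations that must not trap on the
// unused lanes of a partial ABI, e.g. the divisor of operator/: integral
// padding lanes become -1 (via __or with the inverted mask), floating-point
// padding lanes become 1, and the user-visible lanes are left unchanged.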
1079  // }}}
1080  };
1081 
1082 // }}}
1083 // simd_abi::_VecBltnBtmsk {{{
1084 template <int _UsedBytes>
1085  struct simd_abi::_VecBltnBtmsk
1086  {
1087  template <typename _Tp>
1088  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
1089 
1090  // validity traits {{{
1091  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
1092 
1093  template <typename _Tp>
1094  struct _IsValidSizeFor
1095  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
1096  && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
1097  && (_UsedBytes > 32 || __have_avx512vl))> {};
1098 
1099  // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is also
1100  // required.
1101  template <typename _Tp>
1102  struct _IsValid
1103  : conjunction<
1104  _IsValidAbiTag, __bool_constant<__have_avx512f>,
1105  __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
1106  __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
1107  _IsValidSizeFor<_Tp>> {};
1108 
1109  template <typename _Tp>
1110  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
1111 
1112  // }}}
1113  // simd/_MaskImpl {{{
1114  #if _GLIBCXX_SIMD_X86INTRIN
1115  using _CommonImpl = _CommonImplX86;
1116  using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
1117  using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
1118  #else
1119  template <int>
1120  struct _MissingImpl;
1121 
1122  using _CommonImpl = _MissingImpl<_UsedBytes>;
1123  using _SimdImpl = _MissingImpl<_UsedBytes>;
1124  using _MaskImpl = _MissingImpl<_UsedBytes>;
1125  #endif
1126 
1127  // }}}
1128  // __traits {{{
1129  template <typename _Tp>
1130  using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;
1131 
1132  template <typename _Tp>
1133  using __traits = conditional_t<
1134  _S_is_valid_v<_Tp>,
1135  _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
1136  _InvalidTraits>;
1137 
1138  //}}}
1139  // size metadata {{{
1140  template <typename _Tp>
1141  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1142  template <typename _Tp>
1143  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1144 
1145  // }}}
1146  // implicit mask {{{
1147  private:
1148  template <typename _Tp>
1149  using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;
1150 
1151  public:
1152  template <size_t _Np>
1153  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
1154  __implicit_mask_n()
1155  {
1156  using _Tp = __bool_storage_member_type_t<_Np>;
1157  return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
1158  }
1159 
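// E.g., __implicit_mask_n<3>() == 0b111 in an 8-bit mask type, while
// __implicit_mask_n<16>() == 0xffff because 16 bits exactly fill
// __bool_storage_member_type_t<16>.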
1160  template <typename _Tp>
1161  _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
1162  _S_implicit_mask()
1163  { return __implicit_mask_n<_S_size<_Tp>>(); }
1164 
1165  template <typename _Tp>
1166  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
1167  _S_implicit_mask_intrin()
1168  { return __implicit_mask_n<_S_size<_Tp>>(); }
1169 
1170  template <typename _Tp, size_t _Np>
1171  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1172  _S_masked(_SimdWrapper<_Tp, _Np> __x)
1173  {
1174  if constexpr (is_same_v<_Tp, bool>)
1175  if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
1176  return _MaskImpl::_S_bit_and(
1177  __x, _SimdWrapper<_Tp, _Np>(
1178  __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
1179  else
1180  return __x;
1181  else
1182  return _S_masked(__x._M_data);
1183  }
1184 
1185  template <typename _TV>
1186  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
1187  _S_masked(_TV __x)
1188  {
1189  using _Tp = typename _VectorTraits<_TV>::value_type;
1190  static_assert(
1191  !__is_bitmask_v<_TV>,
1192  "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
1193  "know the number of elements. Use _SimdWrapper<bool, N> instead.");
1194  if constexpr (_S_is_partial<_Tp>)
1195  {
1196  constexpr size_t _Np = _S_size<_Tp>;
1197  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1198  _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
1199  _SimdWrapper<_Tp, _Np>(__x));
1200  }
1201  else
1202  return __x;
1203  }
1204 
1205  template <typename _TV, typename _TVT = _VectorTraits<_TV>>
1206  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1207  __make_padding_nonzero(_TV __x)
1208  {
1209  using _Tp = typename _TVT::value_type;
1210  if constexpr (!_S_is_partial<_Tp>)
1211  return __x;
1212  else
1213  {
1214  constexpr size_t _Np = _S_size<_Tp>;
1215  if constexpr (is_integral_v<typename _TVT::value_type>)
1216  return __x
1217  | __generate_vector<_Tp, _S_full_size<_Tp>>(
1218  [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
1219  if (__i < _Np)
1220  return 0;
1221  else
1222  return 1;
1223  });
1224  else
1225  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1226  _S_implicit_mask<_Tp>(),
1227  _SimdWrapper<_Tp, _Np>(
1228  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
1229  _SimdWrapper<_Tp, _Np>(__x))
1230  ._M_data;
1231  }
1232  }
1233 
1234  // }}}
1235  };
1236 
1237 //}}}
1238 // _CommonImplBuiltin {{{
1239 struct _CommonImplBuiltin
1240 {
1241  // _S_converts_via_decomposition{{{
1242  // This lists all cases where a __vector_convert needs to fall back to
1243  // conversion of individual scalars (i.e. decompose the input vector into
1244  // scalars, convert, compose output vector). In those cases, _S_masked_load &
1245  // _S_masked_store prefer to use the _S_bit_iteration implementation.
1246  template <typename _From, typename _To, size_t _ToSize>
1247  static inline constexpr bool __converts_via_decomposition_v
1248  = sizeof(_From) != sizeof(_To);
1249 
1250  // }}}
1251  // _S_load{{{
1252  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
1253  _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
1254  _S_load(const void* __p)
1255  {
1256  static_assert(_Np > 1);
1257  static_assert(_Bytes % sizeof(_Tp) == 0);
1258  using _Rp = __vector_type_t<_Tp, _Np>;
1259  if constexpr (sizeof(_Rp) == _Bytes)
1260  {
1261  _Rp __r;
1262  __builtin_memcpy(&__r, __p, _Bytes);
1263  return __r;
1264  }
1265  else
1266  {
1267 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1268  using _Up = conditional_t<
1269  is_integral_v<_Tp>,
1270  conditional_t<_Bytes % 4 == 0,
1271  conditional_t<_Bytes % 8 == 0, long long, int>,
1272  conditional_t<_Bytes % 2 == 0, short, signed char>>,
1273  conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
1274  double>>;
1275  using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
1276  if constexpr (sizeof(_V) != sizeof(_Rp))
1277  { // on i386 with 4 < _Bytes <= 8
1278  _Rp __r{};
1279  __builtin_memcpy(&__r, __p, _Bytes);
1280  return __r;
1281  }
1282  else
1283 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1284  using _V = _Rp;
1285 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1286  {
1287  _V __r{};
1288  static_assert(_Bytes <= sizeof(_V));
1289  __builtin_memcpy(&__r, __p, _Bytes);
1290  return reinterpret_cast<_Rp>(__r);
1291  }
1292  }
1293  }
1294 
1295  // }}}
1296  // _S_store {{{
1297  template <size_t _Bytes>
1298  _GLIBCXX_SIMD_INTRINSIC static void
1299  _S_memcpy(char* __dst, const char* __src)
1300  {
1301  if constexpr (_Bytes > 0)
1302  {
1303  constexpr size_t _Ns = std::__bit_floor(_Bytes);
1304  __builtin_memcpy(__dst, __src, _Ns);
1305  _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns);
1306  }
1307  }
1308 
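// E.g., _S_memcpy<7> expands into fixed-size copies of 4, 2, and 1 bytes
// (__bit_floor(7) == 4, then 2, then 1), which the compiler can lower to
// plain scalar loads and stores instead of a variable-length memcpy.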
1309  template <size_t _ReqBytes = 0, typename _TV>
1310  _GLIBCXX_SIMD_INTRINSIC static void
1311  _S_store(_TV __x, void* __addr)
1312  {
1313  constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
1314  static_assert(sizeof(__x) >= _Bytes);
1315 
1316 #if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424
1317  if constexpr (__is_vector_type_v<_TV>)
1318  _S_memcpy<_Bytes>(reinterpret_cast<char*>(__addr), reinterpret_cast<const char*>(&__x));
1319  else
1320 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1321  __builtin_memcpy(__addr, &__x, _Bytes);
1322  }
1323 
1324  template <typename _Tp, size_t _Np>
1325  _GLIBCXX_SIMD_INTRINSIC static void
1326  _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
1327  { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }
1328 
1329  // }}}
1330  // _S_store_bool_array(_BitMask) {{{
1331  template <size_t _Np, bool _Sanitized>
1332  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1333  _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
1334  {
1335  if constexpr (_Np == 1)
1336  __mem[0] = __x[0];
1337  else if (__builtin_is_constant_evaluated())
1338  {
1339  for (size_t __i = 0; __i < _Np; ++__i)
1340  __mem[__i] = __x[__i];
1341  }
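// The multiply-and-mask tricks below spread the mask bits into one byte
// each: input bit i contributes to bit 8 * i of the product, and the & with
// 0x01...01 discards everything else. E.g., for _Np == 2, bits 0b10 give
// 2 * 0x81 == 0x102 and 0x102 & 0x0101 == 0x0100, i.e. bytes
// {0x00, 0x01} == {false, true} on a little-endian target.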
1342  else if constexpr (_Np == 2)
1343  {
1344  short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
1345  _S_store<_Np>(__bool2, __mem);
1346  }
1347  else if constexpr (_Np == 3)
1348  {
1349  int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
1350  _S_store<_Np>(__bool3, __mem);
1351  }
1352  else
1353  {
1354  __execute_n_times<__div_roundup(_Np, 4)>(
1355  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1356  constexpr int __offset = __i * 4;
1357  constexpr int __remaining = _Np - __offset;
1358  if constexpr (__remaining > 4 && __remaining <= 7)
1359  {
1360  const _ULLong __bool7
1361  = (__x.template _M_extract<__offset>()._M_to_bits()
1362  * 0x40810204081ULL)
1363  & 0x0101010101010101ULL;
1364  _S_store<__remaining>(__bool7, __mem + __offset);
1365  }
1366  else if constexpr (__remaining >= 4)
1367  {
1368  int __bits = __x.template _M_extract<__offset>()._M_to_bits();
1369  if constexpr (__remaining > 7)
1370  __bits &= 0xf;
1371  const int __bool4 = (__bits * 0x204081) & 0x01010101;
1372  _S_store<4>(__bool4, __mem + __offset);
1373  }
1374  });
1375  }
1376  }
1377 
1378  // }}}
1379  // _S_blend{{{
1380  template <typename _Tp, size_t _Np>
1381  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1382  _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
1383  _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
1384  { return __k._M_data ? __at1._M_data : __at0._M_data; }
1385 
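// The conditional operator on vector operands (a GNU extension) selects
// element-wise: e.g. with __k = {-1, 0, -1, 0} the result is
// {__at1[0], __at0[1], __at1[2], __at0[3]}.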
1386  // }}}
1387 };
1388 
1389 // }}}
1390 // _SimdImplBuiltin {{{1
1391 template <typename _Abi, typename>
1392  struct _SimdImplBuiltin
1393  {
1394  // member types {{{2
1395  template <typename _Tp>
1396  static constexpr size_t _S_max_store_size = 16;
1397 
1398  using abi_type = _Abi;
1399 
1400  template <typename _Tp>
1401  using _TypeTag = _Tp*;
1402 
1403  template <typename _Tp>
1404  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1405 
1406  template <typename _Tp>
1407  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1408 
1409  template <typename _Tp>
1410  static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1411 
1412  template <typename _Tp>
1413  static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1414 
1415  using _CommonImpl = typename _Abi::_CommonImpl;
1416  using _SuperImpl = typename _Abi::_SimdImpl;
1417  using _MaskImpl = typename _Abi::_MaskImpl;
1418 
1419  // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1420  template <typename _Tp, size_t _Np>
1421  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1422  _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1423  { return {__private_init, __x}; }
1424 
1425  template <typename _Tp, size_t _Np>
1426  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1427  _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1428  { return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1429 
1430  // _S_broadcast {{{2
1431  template <typename _Tp>
1432  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1433  _S_broadcast(_Tp __x) noexcept
1434  { return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1435 
1436  // _S_generator {{{2
1437  template <typename _Fp, typename _Tp>
1438  inline static constexpr _SimdMember<_Tp>
1439  _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
1440  {
1441  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1442  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1443  if constexpr (__i < _S_size<_Tp>)
1444  return __gen(__i);
1445  else
1446  return 0;
1447  });
1448  }
1449 
1450  // _S_load {{{2
1451  template <typename _Tp, typename _Up>
1452  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1453  _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1454  {
1455  constexpr size_t _Np = _S_size<_Tp>;
1456  constexpr size_t __max_load_size
1457  = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64
1458  : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32
1459  : 16;
1460  constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1461  if (__builtin_is_constant_evaluated())
1462  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1463  [&](auto __i) constexpr {
1464  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1465  });
1466  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1467  return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
1468  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1469  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1470  });
1471  else if constexpr (is_same_v<_Up, _Tp>)
1472  return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1473  _Np * sizeof(_Tp)>(__mem);
1474  else if constexpr (__bytes_to_load <= __max_load_size)
1475  return __convert<_SimdMember<_Tp>>(
1476  _CommonImpl::template _S_load<_Up, _Np>(__mem));
1477  else if constexpr (__bytes_to_load % __max_load_size == 0)
1478  {
1479  constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1480  constexpr size_t __elements_per_load = _Np / __n_loads;
1481  return __call_with_n_evaluations<__n_loads>(
1482  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1483  return __convert<_SimdMember<_Tp>>(__uncvted...);
1484  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1485  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1486  __mem + __i * __elements_per_load);
1487  });
1488  }
1489  else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
1490  && __max_load_size > 16)
1491  { // e.g. int[] -> <char, 12> with AVX2
1492  constexpr size_t __n_loads
1493  = __bytes_to_load / (__max_load_size / 2);
1494  constexpr size_t __elements_per_load = _Np / __n_loads;
1495  return __call_with_n_evaluations<__n_loads>(
1496  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1497  return __convert<_SimdMember<_Tp>>(__uncvted...);
1498  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1499  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1500  __mem + __i * __elements_per_load);
1501  });
1502  }
1503  else // e.g. int[] -> <char, 9>
1504  return __call_with_subscripts(
1505  __mem, make_index_sequence<_Np>(),
1506  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1507  return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
1508  });
1509  }
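  // Illustrative sketch (not part of this header): a converting copy_from call
  // that reaches the chunked branch above. Assuming an AVX2 target
  // (__max_load_size == 32) and a 32-lane native_simd<signed char>, loading
  // from an int array reads 128 bytes, i.e. four 32-byte loads that are each
  // converted and then concatenated. Exact sizes depend on the target.
  //
  //   namespace stdx = std::experimental;
  //   alignas(64) const int __src[stdx::native_simd<signed char>::size()] = {};
  //   stdx::native_simd<signed char> __v;
  //   __v.copy_from(__src, stdx::element_aligned); // _Tp = signed char, _Up = int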
1510 
1511  // _S_masked_load {{{2
1512  template <typename _Tp, size_t _Np, typename _Up>
1513  static constexpr inline _SimdWrapper<_Tp, _Np>
1514  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1515  const _Up* __mem) noexcept
1516  {
1517  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1518  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1519  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1520  });
1521  return __merge;
1522  }
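  // Illustrative sketch (not part of this header): this generic fallback is
  // reached from where-expressions on targets without a dedicated masked-load
  // override; only lanes selected by the mask read from memory.
  //
  //   namespace stdx = std::experimental;
  //   alignas(64) const float __src[stdx::native_simd<float>::size()] = {};
  //   stdx::native_simd<float> __v([](auto __i) { return float(__i); });
  //   stdx::where(__v < 2, __v).copy_from(__src, stdx::element_aligned);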
1523 
1524  // _S_store {{{2
1525  template <typename _Tp, typename _Up>
1526  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1527  _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1528  {
1529  // TODO: converting int -> "smaller int" can be optimized with AVX512
1530  constexpr size_t _Np = _S_size<_Tp>;
1531  constexpr size_t __max_store_size
1532  = _SuperImpl::template _S_max_store_size<_Up>;
1533  if (__builtin_is_constant_evaluated())
1534  {
1535  for (size_t __i = 0; __i < _Np; ++__i)
1536  __mem[__i] = __v[__i];
1537  }
1538  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1539  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1540  __mem[__i] = __v[__i];
1541  });
1542  else if constexpr (is_same_v<_Up, _Tp>)
1543  _CommonImpl::_S_store(__v, __mem);
1544  else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1545  _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1546  __mem);
1547  else
1548  {
1549  constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1550  // round up to convert the last partial vector as well:
1551  constexpr size_t __stores = __div_roundup(_Np, __vsize);
1552  constexpr size_t __full_stores = _Np / __vsize;
1553  using _V = __vector_type_t<_Up, __vsize>;
1554  const array<_V, __stores> __converted
1555  = __convert_all<_V, __stores>(__v);
1556  __execute_n_times<__full_stores>(
1557  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1558  _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1559  });
1560  if constexpr (__full_stores < __stores)
1561  _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1562  * sizeof(_Up)>(
1563  __converted[__full_stores], __mem + __full_stores * __vsize);
1564  }
1565  }
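  // Illustrative sketch (not part of this header): a converting copy_to that
  // needs more than one store. Assuming an AVX2 target (__max_store_size == 32)
  // and a 32-lane native_simd<signed char>, widening to int produces 128 bytes,
  // i.e. an array of four converted vectors stored back to back (the last one
  // possibly partial). Exact sizes depend on the target.
  //
  //   namespace stdx = std::experimental;
  //   stdx::native_simd<signed char> __v = 7;
  //   alignas(64) int __out[stdx::native_simd<signed char>::size()];
  //   __v.copy_to(__out, stdx::element_aligned); // _Tp = signed char, _Up = int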
1566 
1567  // _S_masked_store_nocvt {{{2
1568  template <typename _Tp, size_t _Np>
1569  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1570  _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k)
1571  {
1572  _BitOps::_S_bit_iteration(
1573  _MaskImpl::_S_to_bits(__k),
1574  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1575  __mem[__i] = __v[__i];
1576  });
1577  }
1578 
1579  // _S_masked_store {{{2
1580  template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1581  typename _Tp = typename _TVT::value_type, typename _Up>
1582  static constexpr inline void
1583  _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept
1584  {
1585  constexpr size_t _TV_size = _S_size<_Tp>;
1586  [[maybe_unused]] const auto __vi = __to_intrin(__v);
1587  constexpr size_t __max_store_size
1588  = _SuperImpl::template _S_max_store_size<_Up>;
1589  if constexpr (
1590  is_same_v<
1591  _Tp,
1592  _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up)))
1593  {
1594  // no conversion needed, or a bit-preserving integral conversion: just reinterpret:
1595  const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1596  if constexpr (__is_bitmask_v<decltype(__k)>)
1597  return _MaskMember<_Up>(__k._M_data);
1598  else
1599  return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1600  }();
1601  _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1602  __mem, __kk);
1603  }
1604  else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1605  && !_CommonImpl::
1606  template __converts_via_decomposition_v<
1607  _Tp, _Up, __max_store_size>)
1608  { // conversion via decomposition is better handled via the
1609  // bit_iteration
1610  // fallback below
1611  constexpr size_t _UW_size
1612  = std::min(_TV_size, __max_store_size / sizeof(_Up));
1613  static_assert(_UW_size <= _TV_size);
1614  using _UW = _SimdWrapper<_Up, _UW_size>;
1615  using _UV = __vector_type_t<_Up, _UW_size>;
1616  using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
1617  if constexpr (_UW_size == _TV_size) // one convert+store
1618  {
1619  const _UW __converted = __convert<_UW>(__v);
1620  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1621  __converted, __mem,
1622  _UAbi::_MaskImpl::template _S_convert<
1623  __int_for_sizeof_t<_Up>>(__k));
1624  }
1625  else
1626  {
1627  static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1628  constexpr size_t _NFullStores = _TV_size / _UW_size;
1629  constexpr size_t _NAllStores
1630  = __div_roundup(_TV_size, _UW_size);
1631  constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1632  const array<_UV, _NAllStores> __converted
1633  = __convert_all<_UV, _NAllStores>(__v);
1634  __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1635  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1636  _UW(__converted[__i]), __mem + __i * _UW_size,
1637  _UAbi::_MaskImpl::template _S_convert<
1638  __int_for_sizeof_t<_Up>>(
1639  __extract_part<__i, _NParts>(__k.__as_full_vector())));
1640  });
1641  if constexpr (_NAllStores
1642  > _NFullStores) // one partial at the end
1643  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1644  _UW(__converted[_NFullStores]),
1645  __mem + _NFullStores * _UW_size,
1646  _UAbi::_MaskImpl::template _S_convert<
1647  __int_for_sizeof_t<_Up>>(
1648  __extract_part<_NFullStores, _NParts>(
1649  __k.__as_full_vector())));
1650  }
1651  }
1652  else
1653  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1654  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1655  __mem[__i] = static_cast<_Up>(__v[__i]);
1656  });
1657  }
1658 
1659  // _S_complement {{{2
1660  template <typename _Tp, size_t _Np>
1661  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1662  _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1663  {
1664  if constexpr (is_floating_point_v<_Tp>)
1665  return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
1666  else
1667  return ~__x._M_data;
1668  }
1669 
1670  // _S_unary_minus {{{2
1671  template <typename _Tp, size_t _Np>
1672  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1673  _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1674  {
1675  // GCC doesn't use the psign instructions, but pxor & psub are just as
1676  // good a choice as pcmpeqd & psign, so plain negation is fine here.
1677  return -__x._M_data;
1678  }
1679 
1680  // arithmetic operators {{{2
1681  template <typename _Tp, size_t _Np>
1682  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1683  _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1684  { return __x._M_data + __y._M_data; }
1685 
1686  template <typename _Tp, size_t _Np>
1687  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1688  _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1689  { return __x._M_data - __y._M_data; }
1690 
1691  template <typename _Tp, size_t _Np>
1692  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1693  _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1694  { return __x._M_data * __y._M_data; }
1695 
1696  template <typename _Tp, size_t _Np>
1697  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1698  _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1699  {
1700  // Division by 0 is always UB, so we must make sure the padding elements
1701  // of partial registers are never zero divisors.
1702  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1703  return __x._M_data / __y._M_data;
1704  else
1705  return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1706  }
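  // Illustrative sketch (not part of this header) of why the padding is
  // patched: for a 3-element simd stored in a 4-wide partial register, the
  // 4th lane of __y is unspecified and may well be 0. Dividing the full vector
  // would then be UB in the padding lane, so __make_padding_nonzero replaces it
  // with a harmless nonzero value; the three visible lanes are unaffected.
  //
  //   namespace stdx = std::experimental;
  //   using _V3 = stdx::simd<float, stdx::simd_abi::deduce_t<float, 3>>;
  //   _V3 __num = 6, __den = 3;
  //   _V3 __q = __num / __den; // padding lane of __den forced nonzero internally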
1707 
1708  template <typename _Tp, size_t _Np>
1709  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1710  _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1711  {
1712  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1713  return __x._M_data % __y._M_data;
1714  else
1715  return __as_vector(__x)
1716  % _Abi::__make_padding_nonzero(__as_vector(__y));
1717  }
1718 
1719  template <typename _Tp, size_t _Np>
1720  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1721  _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1722  { return __and(__x, __y); }
1723 
1724  template <typename _Tp, size_t _Np>
1725  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1726  _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1727  { return __or(__x, __y); }
1728 
1729  template <typename _Tp, size_t _Np>
1730  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1731  _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1732  { return __xor(__x, __y); }
1733 
1734  template <typename _Tp, size_t _Np>
1735  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1736  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1737  { return __x._M_data << __y._M_data; }
1738 
1739  template <typename _Tp, size_t _Np>
1740  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1741  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1742  { return __x._M_data >> __y._M_data; }
1743 
1744  template <typename _Tp, size_t _Np>
1745  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1746  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1747  { return __x._M_data << __y; }
1748 
1749  template <typename _Tp, size_t _Np>
1750  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1751  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1752  { return __x._M_data >> __y; }
1753 
1754  // compares {{{2
1755  // _S_equal_to {{{3
1756  template <typename _Tp, size_t _Np>
1757  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1758  _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1759  { return __x._M_data == __y._M_data; }
1760 
1761  // _S_not_equal_to {{{3
1762  template <typename _Tp, size_t _Np>
1763  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1764  _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1765  { return __x._M_data != __y._M_data; }
1766 
1767  // _S_less {{{3
1768  template <typename _Tp, size_t _Np>
1769  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1770  _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1771  { return __x._M_data < __y._M_data; }
1772 
1773  // _S_less_equal {{{3
1774  template <typename _Tp, size_t _Np>
1775  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1776  _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1777  { return __x._M_data <= __y._M_data; }
1778 
1779  // _S_negate {{{2
1780  template <typename _Tp, size_t _Np>
1781  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1782  _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1783  { return !__x._M_data; }
1784 
1785  // _S_min, _S_max, _S_minmax {{{2
1786  template <typename _Tp, size_t _Np>
1787  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1788  _SimdWrapper<_Tp, _Np>
1789  _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1790  { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1791 
1792  template <typename _Tp, size_t _Np>
1793  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1794  _SimdWrapper<_Tp, _Np>
1795  _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1796  { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1797 
1798  template <typename _Tp, size_t _Np>
1799  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1800  pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1801  _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1802  {
1803  return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1804  __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1805  }
1806 
1807  // reductions {{{2
1808  template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1809  typename _BinaryOperation>
1810  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1811  _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1812  simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1813  {
1814  using _V = __vector_type_t<_Tp, _Np / 2>;
1815  static_assert(sizeof(_V) <= sizeof(__x));
1816  // _S_full_size is the size of the smallest native SIMD register that
1817  // can store _Np/2 elements:
1818  using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1819  using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
1820  const auto __xx = __as_vector(__x);
1821  return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1822  static_cast<_HalfSimd>(__as_vector(__binary_op(
1823  static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1824  static_cast<_FullSimd>(__intrin_bitcast<_V>(
1825  __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
1826  __xx)))))),
1827  __binary_op);
1828  }
1829 
1830  template <typename _Tp, typename _BinaryOperation>
1831  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1832  _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1833  {
1834  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
1835  if constexpr (_Np == 1)
1836  return __x[0];
1837  else if constexpr (_Np == 2)
1838  return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1839  simd<_Tp, simd_abi::scalar>(__x[1]))[0];
1840  else if (__builtin_is_constant_evaluated())
1841  {
1842  simd<_Tp, simd_abi::scalar> __acc = __x[0];
1843  for (size_t __i = 1; __i < _Np; ++__i)
1844  __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i]));
1845  return __acc[0];
1846  }
1847  else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1848  {
1849  [[maybe_unused]] constexpr auto __full_size
1850  = _Abi::template _S_full_size<_Tp>;
1851  if constexpr (_Np == 3)
1852  return __binary_op(
1853  __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1854  simd<_Tp, simd_abi::scalar>(__x[1])),
1855  simd<_Tp, simd_abi::scalar>(__x[2]))[0];
1856  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1857  plus<>>)
1858  {
1859  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1860  return _Ap::_SimdImpl::_S_reduce(
1861  simd<_Tp, _Ap>(__private_init,
1862  _Abi::_S_masked(__as_vector(__x))),
1863  __binary_op);
1864  }
1865  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1866  multiplies<>>)
1867  {
1868  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1869  using _TW = _SimdWrapper<_Tp, __full_size>;
1870  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1871  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1872  _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1873  = __vector_broadcast<__full_size>(_Tp(1));
1874  const _TW __x_full = __data(__x).__as_full_vector();
1875  const _TW __x_padded_with_ones
1876  = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1877  __x_full);
1878  return _Ap::_SimdImpl::_S_reduce(
1879  simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1880  __binary_op);
1881  }
1882  else if constexpr (_Np & 1)
1883  {
1884  using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
1885  return __binary_op(
1886  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1887  simd<_Tp, _Ap>(
1888  __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>(
1889  __as_vector(__x))),
1890  __binary_op)),
1891  simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0];
1892  }
1893  else
1894  return _S_reduce_partial<_Np>(
1895  make_index_sequence<_Np / 2>(),
1896  make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op);
1897  } //}}}
1898  else if constexpr (sizeof(__x) == 16) //{{{
1899  {
1900  if constexpr (_Np == 16)
1901  {
1902  const auto __y = __data(__x);
1903  __x = __binary_op(
1904  _M_make_simd<_Tp, _Np>(
1905  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
1906  7, 7>(__y)),
1907  _M_make_simd<_Tp, _Np>(
1908  __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
1909  14, 14, 15, 15>(__y)));
1910  }
1911  if constexpr (_Np >= 8)
1912  {
1913  const auto __y = __vector_bitcast<short>(__data(__x));
1914  __x = __binary_op(
1915  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1916  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1917  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1918  __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1919  }
1920  if constexpr (_Np >= 4)
1921  {
1922  using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1923  const auto __y = __vector_bitcast<_Up>(__data(__x));
1924  __x = __binary_op(__x,
1925  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1926  __vector_permute<3, 2, 1, 0>(__y))));
1927  }
1928  using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1929  const auto __y = __vector_bitcast<_Up>(__data(__x));
1930  __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1931  __vector_permute<1, 1>(__y))));
1932  return __x[0];
1933  } //}}}
1934  else
1935  {
1936  static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1937  static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
1938  using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
1939  using _V = simd<_Tp, _Ap>;
1940  return _Ap::_SimdImpl::_S_reduce(
1941  __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
1942  _V(__private_init,
1943  __extract<1, 2>(__as_vector(__x)))),
1944  static_cast<_BinaryOperation&&>(__binary_op));
1945  }
1946  }
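  // Illustrative sketch (not part of this header): std::experimental::reduce
  // dispatches here. In the 16-byte branch above, each permute + __binary_op
  // step halves the problem, e.g. for four floats {a, b, c, d} with plus<>:
  //   step 1: {a, b, c, d} + {d, c, b, a}               -> {a+d, b+c, b+c, a+d}
  //   step 2: that + its upper 64-bit half in all lanes -> lane 0 == a+b+c+d
  //
  //   namespace stdx = std::experimental;
  //   // assuming a target where native_simd<float> is a 16-byte vector:
  //   stdx::native_simd<float> __v([](auto __i) { return float(__i) + 1; });
  //   float __sum = stdx::reduce(__v, std::plus<>()); // 1+2+3+4 == 10 with 4 lanes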
1947 
1948  // math {{{2
1949  // frexp, modf and copysign implemented in simd_math.h
1950 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \
1951  template <typename _Tp, typename... _More> \
1952  static _Tp \
1953  _S_##__name(const _Tp& __x, const _More&... __more) \
1954  { \
1955  return __generate_vector<_Tp>( \
1956  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1957  return __name(__x[__i], __more[__i]...); \
1958  }); \
1959  }
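  // Illustrative sketch (not part of this header): _GLIBCXX_SIMD_MATH_FALLBACK(acos)
  // expands to roughly the following member, i.e. the scalar math function is
  // applied lane by lane through __generate_vector:
  //
  //   template <typename _Tp, typename... _More>
  //     static _Tp
  //     _S_acos(const _Tp& __x, const _More&... __more)
  //     {
  //       return __generate_vector<_Tp>(
  //                [&](auto __i) { return acos(__x[__i], __more[__i]...); });
  //     }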
1960 
1961 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1962  template <typename _Tp, typename... _More> \
1963  static typename _Tp::mask_type \
1964  _S_##__name(const _Tp& __x, const _More&... __more) \
1965  { \
1966  return __generate_vector<_Tp>( \
1967  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1968  return __name(__x[__i], __more[__i]...); \
1969  }); \
1970  }
1971 
1972 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1973  template <typename _Tp, typename... _More> \
1974  static auto \
1975  _S_##__name(const _Tp& __x, const _More&... __more) \
1976  { \
1977  return __fixed_size_storage_t<_RetTp, \
1978  _VectorTraits<_Tp>::_S_partial_width>:: \
1979  _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1980  return __meta._S_generator( \
1981  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1982  return __name(__x[__meta._S_offset + __i], \
1983  __more[__meta._S_offset + __i]...); \
1984  }, \
1985  static_cast<_RetTp*>(nullptr)); \
1986  }); \
1987  }
1988 
1989  _GLIBCXX_SIMD_MATH_FALLBACK(acos)
1990  _GLIBCXX_SIMD_MATH_FALLBACK(asin)
1991  _GLIBCXX_SIMD_MATH_FALLBACK(atan)
1992  _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
1993  _GLIBCXX_SIMD_MATH_FALLBACK(cos)
1994  _GLIBCXX_SIMD_MATH_FALLBACK(sin)
1995  _GLIBCXX_SIMD_MATH_FALLBACK(tan)
1996  _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
1997  _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
1998  _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
1999  _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
2000  _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
2001  _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
2002  _GLIBCXX_SIMD_MATH_FALLBACK(exp)
2003  _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
2004  _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
2005  _GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
2006  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
2007  _GLIBCXX_SIMD_MATH_FALLBACK(log)
2008  _GLIBCXX_SIMD_MATH_FALLBACK(log10)
2009  _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
2010  _GLIBCXX_SIMD_MATH_FALLBACK(log2)
2011  _GLIBCXX_SIMD_MATH_FALLBACK(logb)
2012 
2013  // modf implemented in simd_math.h
2014  _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
2015  _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
2016  _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
2017  _GLIBCXX_SIMD_MATH_FALLBACK(fabs)
2018  _GLIBCXX_SIMD_MATH_FALLBACK(pow)
2019  _GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
2020  _GLIBCXX_SIMD_MATH_FALLBACK(erf)
2021  _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
2022  _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
2023  _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
2024 
2025  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
2026  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
2027 
2028  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
2029  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
2030 
2031  _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
2032  _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
2033 
2034  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2035  static _Tp
2036  _S_remquo(const _Tp __x, const _Tp __y,
2037  __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
2038  {
2039  return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2040  int __tmp;
2041  auto __r = remquo(__x[__i], __y[__i], &__tmp);
2042  __z->_M_set(__i, __tmp);
2043  return __r;
2044  });
2045  }
2046 
2047  // copysign in simd_math.h
2048  _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
2049  _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
2050  _GLIBCXX_SIMD_MATH_FALLBACK(fmax)
2051  _GLIBCXX_SIMD_MATH_FALLBACK(fmin)
2052  _GLIBCXX_SIMD_MATH_FALLBACK(fma)
2053 
2054  template <typename _Tp, size_t _Np>
2055  static constexpr _MaskMember<_Tp>
2056  _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
2057  _SimdWrapper<_Tp, _Np> __y) noexcept
2058  {
2059  using _Ip = __int_for_sizeof_t<_Tp>;
2060  const auto __xn = __vector_bitcast<_Ip>(__x);
2061  const auto __yn = __vector_bitcast<_Ip>(__y);
2062  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2063  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2064  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2065  __xp > __yp);
2066  }
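  // Illustrative sketch (not part of this header) of the integer trick used by
  // the ordered comparisons above and below: reinterpreting an IEC 559 float as
  // a signed integer preserves the ordering of non-negative values; negative
  // values compare the wrong way around, so they are mapped to
  // -(bits & __finite_max_v), which restores a total order consistent with <.
  // E.g. for binary32:
  //   bits( 1.0f) = 0x3f800000 ->  0x3f800000
  //   bits(-1.0f) = 0xbf800000 -> -(0x3f800000)
  // so the integer comparison __xp > __yp matches the floating-point order, and
  // the __andnot with _S_isunordered filters out NaN operands.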
2067 
2068  template <typename _Tp, size_t _Np>
2069  static constexpr _MaskMember<_Tp>
2070  _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
2071  _SimdWrapper<_Tp, _Np> __y) noexcept
2072  {
2073  using _Ip = __int_for_sizeof_t<_Tp>;
2074  const auto __xn = __vector_bitcast<_Ip>(__x);
2075  const auto __yn = __vector_bitcast<_Ip>(__y);
2076  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2077  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2078  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2079  __xp >= __yp);
2080  }
2081 
2082  template <typename _Tp, size_t _Np>
2083  static constexpr _MaskMember<_Tp>
2084  _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
2085  {
2086  using _Ip = __int_for_sizeof_t<_Tp>;
2087  const auto __xn = __vector_bitcast<_Ip>(__x);
2088  const auto __yn = __vector_bitcast<_Ip>(__y);
2089  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2090  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2091  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2092  __xp < __yp);
2093  }
2094 
2095  template <typename _Tp, size_t _Np>
2096  static constexpr _MaskMember<_Tp>
2097  _S_islessequal(_SimdWrapper<_Tp, _Np> __x,
2098  _SimdWrapper<_Tp, _Np> __y) noexcept
2099  {
2100  using _Ip = __int_for_sizeof_t<_Tp>;
2101  const auto __xn = __vector_bitcast<_Ip>(__x);
2102  const auto __yn = __vector_bitcast<_Ip>(__y);
2103  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2104  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2105  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2106  __xp <= __yp);
2107  }
2108 
2109  template <typename _Tp, size_t _Np>
2110  static constexpr _MaskMember<_Tp>
2111  _S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
2112  _SimdWrapper<_Tp, _Np> __y) noexcept
2113  {
2114  return __andnot(_SuperImpl::_S_isunordered(__x, __y),
2115  _SuperImpl::_S_not_equal_to(__x, __y));
2116  }
2117 
2118 #undef _GLIBCXX_SIMD_MATH_FALLBACK
2119 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
2120 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
2121  // _S_abs {{{3
2122  template <typename _Tp, size_t _Np>
2123  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2124  _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
2125  {
2126  // if (__builtin_is_constant_evaluated())
2127  // {
2128  // return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2129  // }
2130  if constexpr (is_floating_point_v<_Tp>)
2131  // `v < 0 ? -v : v` cannot compile to the efficient implementation of
2132  // masking the signbit off because it must consider v == -0
2133 
2134  // ~(-0.) & v would be easy, but breaks with fno-signed-zeros
2135  return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2136  else
2137  return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2138  }
2139 
2140  // }}}3
2141  // _S_plus_minus {{{
2142  // Returns __x + __y - __y without -fassociative-math optimizing to __x.
2143  // - _TV must be __vector_type_t<floating-point type, N>.
2144  // - _UV must be _TV or floating-point type.
2145  template <typename _TV, typename _UV>
2146  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2147  _S_plus_minus(_TV __x, _UV __y) noexcept
2148  {
2149 #if defined __i386__ && !defined __SSE_MATH__
2150  if constexpr (sizeof(__x) == 8)
2151  { // operations on __x would use the FPU
2152  static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2153  const auto __x4 = __vector_bitcast<float, 4>(__x);
2154  if constexpr (is_same_v<_TV, _UV>)
2155  return __vector_bitcast<float, 2>(
2156  _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2157  else
2158  return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2159  }
2160 #endif
2161 #if !defined __clang__ && __GCC_IEC_559 == 0
2162  if (__builtin_is_constant_evaluated()
2163  || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2164  return (__x + __y) - __y;
2165  else
2166  return [&] {
2167  __x += __y;
2168  if constexpr(__have_sse)
2169  {
2170  if constexpr (sizeof(__x) >= 16)
2171  asm("" : "+x"(__x));
2172  else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2173  asm("" : "+x"(__x[0]), "+x"(__x[1]));
2174  else
2175  __assert_unreachable<_TV>();
2176  }
2177  else if constexpr(__have_neon)
2178  asm("" : "+w"(__x));
2179  else if constexpr (__have_power_vmx)
2180  {
2181  if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2182  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2183  else
2184  asm("" : "+v"(__x));
2185  }
2186  else
2187  asm("" : "+g"(__x));
2188  return __x - __y;
2189  }();
2190 #else
2191  return (__x + __y) - __y;
2192 #endif
2193  }
2194 
2195  // }}}
2196  // _S_nearbyint {{{3
2197  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2198  _GLIBCXX_SIMD_INTRINSIC static _Tp
2199  _S_nearbyint(_Tp __x_) noexcept
2200  {
2201  using value_type = typename _TVT::value_type;
2202  using _V = typename _TVT::type;
2203  const _V __x = __x_;
2204  const _V __absx = __and(__x, _S_absmask<_V>);
2205  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2206  _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2207  = _V() + (1ull << (__digits_v<value_type> - 1));
2208  const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2209  const _V __shifted = _S_plus_minus(__x, __shifter);
2210  return __absx < __shifter_abs ? __shifted : __x;
2211  }
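  // Illustrative sketch (not part of this header) of the shifter trick above,
  // using double (53 mantissa digits, __shifter_abs == 2^52): adding 2^52 to a
  // value with |x| < 2^52 leaves no room for fraction bits, so the addition
  // rounds to an integer in the current rounding mode; subtracting 2^52 again
  // (through _S_plus_minus, so the compiler cannot fold the two operations
  // away) yields the rounded value.
  //   x = 2.3: (2.3 + 2^52) - 2^52 == 2.0   (round to nearest)
  //   x = 2.7: (2.7 + 2^52) - 2^52 == 3.0
  // The sign of __x is copied onto the shifter so that negative inputs round
  // symmetrically, and inputs with |x| >= 2^52 are already integral and are
  // returned unchanged.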
2212 
2213  // _S_rint {{{3
2214  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2215  _GLIBCXX_SIMD_INTRINSIC static _Tp
2216  _S_rint(_Tp __x) noexcept
2217  { return _SuperImpl::_S_nearbyint(__x); }
2218 
2219  // _S_trunc {{{3
2220  template <typename _Tp, size_t _Np>
2221  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2222  _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2223  {
2224  using _V = __vector_type_t<_Tp, _Np>;
2225  const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2226  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
2227  constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
2228  _V __truncated = _S_plus_minus(__absx, __shifter);
2229  __truncated -= __truncated > __absx ? _V() + 1 : _V();
2230  return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2231  : __x._M_data;
2232  }
2233 
2234  // _S_round {{{3
2235  template <typename _Tp, size_t _Np>
2236  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2237  _S_round(_SimdWrapper<_Tp, _Np> __x)
2238  {
2239  const auto __abs_x = _SuperImpl::_S_abs(__x);
2240  const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2241  const auto __r_abs // round(abs(x)) =
2242  = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
2243  return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2244  }
2245 
2246  // _S_floor {{{3
2247  template <typename _Tp, size_t _Np>
2248  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2249  _S_floor(_SimdWrapper<_Tp, _Np> __x)
2250  {
2251  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2252  const auto __negative_input
2253  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2254  const auto __mask
2255  = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2256  return __or(__andnot(__mask, __y),
2257  __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
2258  }
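  // Illustrative worked example (not part of this header): _S_floor(-2.5)
  //   __y = trunc(-2.5) = -2; the input is negative and __y != __x, so the
  //   mask selects __y - 1 = -3. For -2.0 the mask is false and trunc(-2.0)
  //   = -2 is returned unchanged.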
2259 
2260  // _S_ceil {{{3
2261  template <typename _Tp, size_t _Np>
2262  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2263  _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2264  {
2265  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2266  const auto __negative_input
2267  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2268  const auto __inv_mask
2269  = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2270  return __or(__and(__inv_mask, __y),
2271  __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
2272  }
2273 
2274  // _S_isnan {{{3
2275  template <typename _Tp, size_t _Np>
2276  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2277  _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2278  {
2279 #if __FINITE_MATH_ONLY__
2280  return {}; // false
2281 #elif !defined __SUPPORT_SNAN__
2282  return ~(__x._M_data == __x._M_data);
2283 #elif defined __STDC_IEC_559__
2284  using _Ip = __int_for_sizeof_t<_Tp>;
2285  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2286  const auto __infn
2287  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2288  return __infn < __absn;
2289 #else
2290 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2291 #endif
2292  }
2293 
2294  // _S_isfinite {{{3
2295  template <typename _Tp, size_t _Np>
2296  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2297  _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2298  {
2299 #if __FINITE_MATH_ONLY__
2300  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2301  _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2302  return __alltrue;
2303 #else
2304  // if all exponent bits are set, __x is either inf or NaN
2305  using _Ip = __int_for_sizeof_t<_Tp>;
2306  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2307  const auto __maxn
2308  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2309  return __absn <= __maxn;
2310 #endif
2311  }
2312 
2313  // _S_isunordered {{{3
2314  template <typename _Tp, size_t _Np>
2315  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2316  _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2317  { return __or(_S_isnan(__x), _S_isnan(__y)); }
2318 
2319  // _S_signbit {{{3
2320  template <typename _Tp, size_t _Np>
2321  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2322  _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2323  {
2324  using _Ip = __int_for_sizeof_t<_Tp>;
2325  return __vector_bitcast<_Ip>(__x) < 0;
2326  // Arithmetic right shift (SRA) would also work (instead of compare), but
2327  // 64-bit SRA isn't available on x86 before AVX512. And in general,
2328  // compares are more likely to be efficient than SRA.
2329  }
2330 
2331  // _S_isinf {{{3
2332  template <typename _Tp, size_t _Np>
2333  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2334  _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2335  {
2336 #if __FINITE_MATH_ONLY__
2337  return {}; // false
2338 #else
2339  return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2340  __vector_broadcast<_Np>(
2341  __infinity_v<_Tp>));
2342  // alternative:
2343  // compare to inf using the corresponding integer type
2344  /*
2345  return
2346  __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>(
2347  _S_abs(__x)._M_data)
2348  ==
2349  __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>(
2350  __infinity_v<_Tp>)));
2351  */
2352 #endif
2353  }
2354 
2355  // _S_isnormal {{{3
2356  template <typename _Tp, size_t _Np>
2357  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2358  _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2359  {
2360  using _Ip = __int_for_sizeof_t<_Tp>;
2361  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2362  const auto __minn
2363  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2364 #if __FINITE_MATH_ONLY__
2365  return __absn >= __minn;
2366 #else
2367  const auto __maxn
2368  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2369  return __minn <= __absn && __absn <= __maxn;
2370 #endif
2371  }
2372 
2373  // _S_fpclassify {{{3
2374  template <typename _Tp, size_t _Np>
2375  _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2376  _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2377  {
2378  using _I = __int_for_sizeof_t<_Tp>;
2379  const auto __xn
2380  = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
2381  constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2382  _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2383  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2384 
2385  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2386  = __vector_broadcast<_NI, _I>(FP_NORMAL);
2387 #if !__FINITE_MATH_ONLY__
2388  _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2389  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2390  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2391  = __vector_broadcast<_NI, _I>(FP_NAN);
2392  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2393  = __vector_broadcast<_NI, _I>(FP_INFINITE);
2394 #endif
2395 #ifndef __FAST_MATH__
2396  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2397  = __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2398 #endif
2399  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2400  = __vector_broadcast<_NI, _I>(FP_ZERO);
2401 
2402  __vector_type_t<_I, _NI>
2403  __tmp = __xn < __minn
2404  #ifdef __FAST_MATH__
2405  ? __fp_zero
2406  #else
2407  ? (__xn == 0 ? __fp_zero : __fp_subnormal)
2408  #endif
2409  #if __FINITE_MATH_ONLY__
2410  : __fp_normal;
2411  #else
2412  : (__xn < __infn ? __fp_normal
2413  : (__xn == __infn ? __fp_infinite : __fp_nan));
2414  #endif
2415 
2416  if constexpr (sizeof(_I) == sizeof(int))
2417  {
2418  using _FixedInt = __fixed_size_storage_t<int, _Np>;
2419  const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2420  if constexpr (_FixedInt::_S_tuple_size == 1)
2421  return {__as_int};
2422  else if constexpr (_FixedInt::_S_tuple_size == 2
2423  && is_same_v<
2424  typename _FixedInt::_SecondType::_FirstAbi,
2425  simd_abi::scalar>)
2426  return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
2427  else if constexpr (_FixedInt::_S_tuple_size == 2)
2428  return {__extract<0, 2>(__as_int),
2429  __auto_bitcast(__extract<1, 2>(__as_int))};
2430  else
2431  __assert_unreachable<_Tp>();
2432  }
2433  else if constexpr (_Np == 2 && sizeof(_I) == 8
2434  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
2435  {
2436  const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2437  return {int(__aslong[0]), {int(__aslong[1])}};
2438  }
2439 #if _GLIBCXX_SIMD_X86INTRIN
2440  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2441  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2442  return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2443  __to_intrin(__hi128(__tmp)))};
2444  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2445  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2446  return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2447 #endif // _GLIBCXX_SIMD_X86INTRIN
2448  else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2449  return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2450  [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2451  return __make_wrapper<int>(__l...);
2452  })};
2453  else
2454  __assert_unreachable<_Tp>();
2455  }
2456 
2457  // _S_increment & _S_decrement{{{2
2458  template <typename _Tp, size_t _Np>
2459  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2460  _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2461  { __x = __x._M_data + 1; }
2462 
2463  template <typename _Tp, size_t _Np>
2464  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2465  _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2466  { __x = __x._M_data - 1; }
2467 
2468  // smart_reference access {{{2
2469  template <typename _Tp, size_t _Np, typename _Up>
2470  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2471  _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2472  { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2473 
2474  // _S_masked_assign{{{2
2475  template <typename _Tp, typename _K, size_t _Np>
2476  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2477  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2478  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2479  {
2480  if (__k._M_is_constprop_none_of())
2481  return;
2482  else if (__k._M_is_constprop_all_of())
2483  __lhs = __rhs;
2484  else
2485  __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2486  }
2487 
2488  template <typename _Tp, typename _K, size_t _Np>
2489  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2490  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2491  __type_identity_t<_Tp> __rhs)
2492  {
2493  if (__k._M_is_constprop_none_of())
2494  return;
2495  else if (__k._M_is_constprop_all_of())
2496  __lhs = __vector_broadcast<_Np>(__rhs);
2497  else if (__builtin_constant_p(__rhs) && __rhs == 0)
2498  {
2499  if constexpr (!is_same_v<bool, _K>)
2500  // the __andnot optimization only makes sense if __k._M_data is a
2501  // vector register
2502  __lhs._M_data
2503  = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2504  else
2505  // for AVX512/__mmask, a _mm512_maskz_mov is best
2506  __lhs
2507  = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2508  }
2509  else
2510  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2511  _SimdWrapper<_Tp, _Np>(
2512  __vector_broadcast<_Np>(__rhs)));
2513  }
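  // Illustrative sketch (not part of this header): the constant-0 special case
  // above lets a masked assignment of zero compile to a single andnot instead
  // of a blend:
  //
  //   namespace stdx = std::experimental;
  //   stdx::native_simd<float> __v([](auto __i) { return float(__i); });
  //   stdx::where(__v > 1, __v) = 0; // __lhs = __andnot(mask, __lhs)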
2514 
2515  // _S_masked_cassign {{{2
2516  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2517  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2518  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2519  _SimdWrapper<_Tp, _Np>& __lhs,
2520  const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2521  _Op __op)
2522  {
2523  if (__k._M_is_constprop_none_of())
2524  return;
2525  else if (__k._M_is_constprop_all_of())
2526  __lhs = __op(_SuperImpl{}, __lhs, __rhs);
2527  else
2528  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2529  __op(_SuperImpl{}, __lhs, __rhs));
2530  }
2531 
2532  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2533  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2534  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2535  _SimdWrapper<_Tp, _Np>& __lhs,
2536  const __type_identity_t<_Tp> __rhs, _Op __op)
2537  { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2538 
2539  // _S_masked_unary {{{2
2540  template <template <typename> class _Op, typename _Tp, typename _K,
2541  size_t _Np>
2542  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2543  _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2544  const _SimdWrapper<_Tp, _Np> __v)
2545  {
2546  if (__k._M_is_constprop_none_of())
2547  return __v;
2548  auto __vv = _M_make_simd(__v);
2549  _Op<decltype(__vv)> __op;
2550  if (__k._M_is_constprop_all_of())
2551  return __data(__op(__vv));
2552  else if constexpr (is_same_v<_Op<void>, __increment<void>>)
2553  {
2554  static_assert(not std::is_same_v<_K, bool>);
2555  if constexpr (is_integral_v<_Tp>)
2556  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2557  return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2558  else if constexpr (not __have_avx2)
2559  return __v._M_data
2560  + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2561  _K, _Tp(1)));
2562  // starting with AVX2 it is more efficient to blend after add
2563  }
2564  else if constexpr (is_same_v<_Op<void>, __decrement<void>>)
2565  {
2566  static_assert(not std::is_same_v<_K, bool>);
2567  if constexpr (is_integral_v<_Tp>)
2568  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2569  return __v._M_data + __vector_bitcast<_Tp>(__k._M_data);
2570  else if constexpr (not __have_avx2)
2571  return __v._M_data
2572  - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2573  _K, _Tp(1)));
2574  // starting with AVX2 it is more efficient to blend after sub
2575  }
2576  return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2577  }
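  // Illustrative sketch (not part of this header) of the integral shortcut
  // above: the vector mask __k holds -1 in selected lanes and 0 elsewhere, so a
  // masked increment is simply __v - __k, e.g.
  //   __v = {3, 3, 3, 3}, __k = {-1, 0, -1, 0}  ->  __v - __k = {4, 3, 4, 3}
  //
  //   namespace stdx = std::experimental;
  //   stdx::native_simd<int> __v = 3;
  //   ++stdx::where(__v == 3, __v); // all lanes selected -> every lane becomes 4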
2578 
2579  //}}}2
2580  };
2581 
2582 // _MaskImplBuiltinMixin {{{1
2583 struct _MaskImplBuiltinMixin
2584 {
2585  template <typename _Tp>
2586  using _TypeTag = _Tp*;
2587 
2588  // _S_to_maskvector {{{
2589  template <typename _Up, size_t _ToN = 1>
2590  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2591  _S_to_maskvector(bool __x)
2592  {
2593  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2594  return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2595  : __vector_type_t<_Up, _ToN>{};
2596  }
2597 
2598  template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2599  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2600  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2601  _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2602  {
2603  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2604  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2605  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2606  if constexpr (__i < _Np)
2607  return __x[__i] ? ~_Up() : _Up();
2608  else
2609  return _Up();
2610  });
2611  }
2612 
2613  template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
2614  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2615  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2616  _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2617  {
2618  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2619  using _TW = _SimdWrapper<_Tp, _Np>;
2620  using _UW = _SimdWrapper<_Up, _ToN>;
2621  if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2622  return __wrapper_bitcast<_Up, _ToN>(__x);
2623  else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2624  return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2625  else
2626  { // vector -> vector
2627  /*
2628  [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2629  if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) ==
2630  16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr
2631  (sizeof(_Tp) == 4 && sizeof(_Up) == 2
2632  && sizeof(__y) == 16)
2633  return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2634  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
2635  && sizeof(__y) == 16)
2636  return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2637  else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
2638  && sizeof(__y) == 16)
2639  return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2640  -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 &&
2641  sizeof(_Up) == 1
2642  && sizeof(__y) == 16)
2643  return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
2644  -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 &&
2645  sizeof(_Up) == 1
2646  && sizeof(__y) == 16)
2647  return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2648  -1, -1, -1, -1, -1>(__y); else
2649  */
2650  {
2651  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2652  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2653  if constexpr (__i < _Np)
2654  return _Up(__x[__i.value]);
2655  else
2656  return _Up();
2657  });
2658  }
2659  }
2660  }
2661 
2662  // }}}
2663  // _S_to_bits {{{
2664  template <typename _Tp, size_t _Np>
2665  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2666  _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2667  {
2668  static_assert(!is_same_v<_Tp, bool>);
2669  static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2670  using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2671  const auto __bools
2672  = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
2673  _ULLong __r = 0;
2674  __execute_n_times<_Np>(
2675  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2676  __r |= _ULLong(__bools[__i.value]) << __i;
2677  });
2678  return __r;
2679  }
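  // Illustrative sketch (not part of this header): _S_to_bits compresses a
  // vector mask into a bitmask by shifting out everything but the sign bit of
  // each lane, e.g. for a 4-lane int mask
  //   __x = {-1, 0, -1, -1}  ->  __bools = {1, 0, 1, 1}  ->  bits 0b1101
  // (lane 0 becomes bit 0).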
2680 
2681  // }}}
2682 };
2683 
2684 // _MaskImplBuiltin {{{1
2685 template <typename _Abi, typename>
2686  struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2687  {
2688  using _MaskImplBuiltinMixin::_S_to_bits;
2689  using _MaskImplBuiltinMixin::_S_to_maskvector;
2690 
2691  // member types {{{
2692  template <typename _Tp>
2693  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2694 
2695  template <typename _Tp>
2696  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2697 
2698  using _SuperImpl = typename _Abi::_MaskImpl;
2699  using _CommonImpl = typename _Abi::_CommonImpl;
2700 
2701  template <typename _Tp>
2702  static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2703 
2704  // }}}
2705  // _S_broadcast {{{
2706  template <typename _Tp>
2707  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2708  _S_broadcast(bool __x)
2709  { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2710 
2711  // }}}
2712  // _S_load {{{
2713  template <typename _Tp>
2714  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2715  _S_load(const bool* __mem)
2716  {
2717  using _I = __int_for_sizeof_t<_Tp>;
2718  if (not __builtin_is_constant_evaluated())
2719  if constexpr (sizeof(_Tp) == sizeof(bool))
2720  {
2721  const auto __bools
2722  = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2723  // bool is {0, 1}, everything else is UB
2724  return __bools > 0;
2725  }
2726  return __generate_vector<_I, _S_size<_Tp>>(
2727  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2728  return __mem[__i] ? ~_I() : _I();
2729  });
2730  }
2731 
2732  // }}}
2733  // _S_convert {{{
2734  template <typename _Tp, size_t _Np, bool _Sanitized>
2735  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2736  _S_convert(_BitMask<_Np, _Sanitized> __x)
2737  {
2738  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2739  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2740  else
2741  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2742  _S_size<_Tp>>(
2743  __x._M_sanitized());
2744  }
2745 
2746  template <typename _Tp, size_t _Np>
2747  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2748  _S_convert(_SimdWrapper<bool, _Np> __x)
2749  {
2750  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2751  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2752  else
2753  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2754  _S_size<_Tp>>(
2755  _BitMask<_Np>(__x._M_data)._M_sanitized());
2756  }
2757 
2758  template <typename _Tp, typename _Up, size_t _Np>
2759  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2760  _S_convert(_SimdWrapper<_Up, _Np> __x)
2761  {
2762  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2763  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2764  _SuperImpl::_S_to_bits(__x));
2765  else
2766  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2767  _S_size<_Tp>>(__x);
2768  }
2769 
2770  template <typename _Tp, typename _Up, typename _UAbi>
2771  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2772  _S_convert(simd_mask<_Up, _UAbi> __x)
2773  {
2774  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2775  {
2776  using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2777  if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2778  return _R(__data(__x));
2779  else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2780  return _R(__data(__x));
2781  else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2782  return _R(__data(__x)._M_to_bits());
2783  else // vector -> bits
2784  return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2785  }
2786  else
2787  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2788  _S_size<_Tp>>(
2789  __data(__x));
2790  }
2791 
2792  // }}}
2793  // _S_masked_load {{{2
2794  template <typename _Tp, size_t _Np>
2795  static inline _SimdWrapper<_Tp, _Np>
2796  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2797  _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2798  {
2799  // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2800  auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2801  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2802  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2803  __tmp._M_set(__i, -__mem[__i]);
2804  });
2805  __merge = __wrapper_bitcast<_Tp>(__tmp);
2806  return __merge;
2807  }
2808 
2809  // _S_store {{{2
2810  template <typename _Tp, size_t _Np>
2811  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2812  _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2813  {
2814  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2815  __mem[__i] = __v[__i];
2816  });
2817  }
2818 
2819  // _S_masked_store {{{2
2820  template <typename _Tp, size_t _Np>
2821  static inline void
2822  _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2823  const _SimdWrapper<_Tp, _Np> __k) noexcept
2824  {
2825  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2826  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2827  __mem[__i] = __v[__i];
2828  });
2829  }
2830 
2831  // _S_from_bitmask{{{2
2832  template <size_t _Np, typename _Tp>
2833  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2834  _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2835  { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); }
2836 
2837  // logical and bitwise operators {{{2
2838  template <typename _Tp, size_t _Np>
2839  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2840  _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2841  { return __and(__x._M_data, __y._M_data); }
2842 
2843  template <typename _Tp, size_t _Np>
2844  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2845  _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2846  { return __or(__x._M_data, __y._M_data); }
2847 
2848  template <typename _Tp, size_t _Np>
2849  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2850  _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2851  {
2852  if constexpr (_Abi::template _S_is_partial<_Tp>)
2853  return __andnot(__x, __wrapper_bitcast<_Tp>(
2854  _Abi::template _S_implicit_mask<_Tp>()));
2855  else
2856  return __not(__x._M_data);
2857  }
2858 
2859  template <typename _Tp, size_t _Np>
2860  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2861  _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2862  { return __and(__x._M_data, __y._M_data); }
2863 
2864  template <typename _Tp, size_t _Np>
2865  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2866  _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2867  { return __or(__x._M_data, __y._M_data); }
2868 
2869  template <typename _Tp, size_t _Np>
2870  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2871  _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2872  { return __xor(__x._M_data, __y._M_data); }
2873 
2874  // smart_reference access {{{2
2875  template <typename _Tp, size_t _Np>
2876  static constexpr void
2877  _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept
2878  {
2879  if constexpr (is_same_v<_Tp, bool>)
2880  __k._M_set(__i, __x);
2881  else
2882  {
2883  static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2884  if (__builtin_is_constant_evaluated())
2885  {
2886  __k = __generate_from_n_evaluations<_Np,
2887  __vector_type_t<_Tp, _Np>>(
2888  [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2889  if (__i == static_cast<int>(__j))
2890  return _Tp(-__x);
2891  else
2892  return __k[+__j];
2893  });
2894  }
2895  else
2896  __k._M_data[__i] = -__x;
2897  }
2898  }
2899 
2900  // _S_masked_assign{{{2
2901  template <typename _Tp, size_t _Np>
2902  _GLIBCXX_SIMD_INTRINSIC static void
2903  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2904  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2905  { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2906 
2907  template <typename _Tp, size_t _Np>
2908  _GLIBCXX_SIMD_INTRINSIC static void
2909  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2910  {
2911  if (__builtin_constant_p(__rhs))
2912  {
2913  if (__rhs == false)
2914  __lhs = __andnot(__k, __lhs);
2915  else
2916  __lhs = __or(__k, __lhs);
2917  return;
2918  }
2919  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2920  __data(simd_mask<_Tp, _Abi>(__rhs)));
2921  }
2922 
2923  //}}}2
2924  // _S_all_of {{{
2925  template <typename _Tp>
2926  _GLIBCXX_SIMD_INTRINSIC static bool
2927  _S_all_of(simd_mask<_Tp, _Abi> __k)
2928  {
2929  return __call_with_subscripts(
2930  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2931  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2932  { return (... && !(__ent == 0)); });
2933  }
2934 
2935  // }}}
2936  // _S_any_of {{{
2937  template <typename _Tp>
2938  _GLIBCXX_SIMD_INTRINSIC static bool
2939  _S_any_of(simd_mask<_Tp, _Abi> __k)
2940  {
2941  return __call_with_subscripts(
2942  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2943  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2944  { return (... || !(__ent == 0)); });
2945  }
2946 
2947  // }}}
2948  // _S_none_of {{{
2949  template <typename _Tp>
2950  _GLIBCXX_SIMD_INTRINSIC static bool
2951  _S_none_of(simd_mask<_Tp, _Abi> __k)
2952  {
2953  return __call_with_subscripts(
2954  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2955  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2956  { return (... && (__ent == 0)); });
2957  }
2958 
2959  // }}}
2960  // _S_some_of {{{
2961  template <typename _Tp>
2962  _GLIBCXX_SIMD_INTRINSIC static bool
2963  _S_some_of(simd_mask<_Tp, _Abi> __k)
2964  {
2965  const int __n_true = _SuperImpl::_S_popcount(__k);
2966  return __n_true > 0 && __n_true < int(_S_size<_Tp>);
2967  }
2968 
2969  // }}}
2970  // _S_popcount {{{
2971  template <typename _Tp>
2972  _GLIBCXX_SIMD_INTRINSIC static int
2973  _S_popcount(simd_mask<_Tp, _Abi> __k)
2974  {
2975  using _I = __int_for_sizeof_t<_Tp>;
2976  if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2977  return -reduce(
2978  simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2979  else
2980  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2981  simd<_Tp, _Abi>(__private_init, __data(__k))));
2982  }
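  // Illustrative sketch (not part of this header): each true lane of the mask
  // is -1 when reinterpreted as the equally sized signed integer, so reduce()
  // over those lanes yields -(number of true lanes); the leading minus above
  // turns that into the popcount, e.g. {-1, 0, -1, 0} reduces to -2 -> 2.
  //
  //   namespace stdx = std::experimental;
  //   stdx::native_simd<float> __v([](auto __i) { return float(__i); });
  //   int __n = stdx::popcount(__v < 2); // exactly two lanes (0 and 1) are true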
2983 
2984  // }}}
2985  // _S_find_first_set {{{
2986  template <typename _Tp>
2987  _GLIBCXX_SIMD_INTRINSIC static int
2988  _S_find_first_set(simd_mask<_Tp, _Abi> __k)
2989  { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }
2990 
2991  // }}}
2992  // _S_find_last_set {{{
2993  template <typename _Tp>
2994  _GLIBCXX_SIMD_INTRINSIC static int
2995  _S_find_last_set(simd_mask<_Tp, _Abi> __k)
2996  { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; }
2997 
2998  // }}}
2999  };
3000 
3001 //}}}1
3002 _GLIBCXX_SIMD_END_NAMESPACE
3003 #endif // __cplusplus >= 201703L
3004 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
3005 
3006 // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100