92#ifndef VSPLINE_HWY_SIMD_TYPE_H
93#define VSPLINE_HWY_SIMD_TYPE_H
100#ifndef HWY_SIMD_TYPE_H
101#define HWY_SIMD_TYPE_H
103#include <hwy/highway.h>
104#include <hwy/contrib/math/math-inl.h>
105#include <hwy/aligned_allocator.h>
106#include <hwy/print-inl.h>
107#ifndef HWY_HAVE_ATAN2
150template <
typename D , std::
size_t _vsize >
153 typedef typename hn::TFromD < D >
T ;
154 typedef typename hn::Vec < D >
vec_t ;
157 static const std::size_t
vsize = _vsize ;
169 static const std::size_t mask_bytes =
vsize ;
173 HWY_ALIGN uint8_t inner [ mask_bytes ] ;
180#ifdef HWY_HAVE_SCALABLE
181 std::size_t L()
const
183 return Lanes ( D() ) ;
186 static constexpr std::size_t
L()
188 return Lanes ( D() ) ;
220 return hn::LoadMaskBits( D() , inner + i * L() ) ;
225 hn::StoreMaskBits( D() , rhs , inner + i * L() ) ;
235 std::size_t n_lanes = Lanes ( D() ) ;
236 std::size_t n_mask =
vsize / n_lanes ;
237 for ( std::size_t i = 0 ; i < n_mask ; i++ )
239 std::size_t ofs = i * n_lanes ;
241 for ( std::size_t k = 0 ; k < n_lanes ; k++ )
243 std::size_t
byte = k / 8 ;
244 if ( inner [ ofs +
byte ] & bit )
260 std::size_t n_lanes = Lanes ( D() ) ;
261 std::size_t n_mask =
vsize / n_lanes ;
262 for ( std::size_t i = 0 ; i < n_mask ; i++ )
264 std::size_t ofs = i * n_lanes ;
266 for ( std::size_t k = 0 ; k < n_lanes ; k++ )
268 std::size_t
byte = k / 8 ;
270 inner [ ofs + byte ] |= bit ;
272 inner [ ofs + byte ] &= ~bit ;
288 template <
typename D1 ,
typename D2 >
292 const std::size_t in_n_lanes = Lanes ( D1() ) ;
293 const std::size_t out_n_lanes = Lanes ( D2() ) ;
294 std::size_t in_m = 0 ;
295 std::size_t out_m = 0 ;
296 std::size_t in_ofs = 0 ;
297 std::size_t out_ofs = 0 ;
298 std::size_t in_l = 0 ;
299 std::size_t out_l = 0 ;
301 uint8_t out_bit = 1 ;
303 const uint8_t * p_in = in_mask.
data() ;
304 uint8_t * p_out = out_mask.
data() ;
306 for ( std::size_t e = 0 ; e <
vsize ; e++ )
308 if ( p_in [ in_ofs ] & in_bit )
309 p_out [ out_ofs ] |= out_bit ;
311 p_out [ out_ofs ] &= ~out_bit ;
313 if ( ++in_l == in_n_lanes )
317 in_ofs = in_m * in_n_lanes ;
330 if ( ++out_l == out_n_lanes )
334 out_ofs = out_m * out_n_lanes ;
360 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
370 #define OPEQ_FUNC(OP,OPFN) \
371 mchunk_t & OP ( const mchunk_t & rhs ) \
373 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
374 take ( i , OPFN ( yield ( i ) , rhs.yield ( i ) ) ) ; \
394 template <
typename D2 >
399 const auto & trhs (
reinterpret_cast < const mchunk_t &
> ( rhs ) ) ;
403 template <
typename D2 >
404 void _assign (
const mchunk_t < D2 , vsize > & rhs , std::false_type )
410 transfer ( rhs , *
this ) ;
421 template <
typename D2 >
425 std::conditional <
sizeof (
T ) ==
sizeof ( hn::TFromD < D2 > ) ,
427 std::false_type > :: type eq_t ;
429 _assign ( rhs , eq_t() ) ;
435 template <
typename D2 >
444 #define OP_FUNC(OPFUNC,OPEQ) \
445 template < typename D2 > \
446 mchunk_t OPFUNC ( const mchunk_t < D2 , vsize > & rhs ) const \
448 mchunk_t help ( *this ) ; \
464 #define OP_FUNC(OPFUNC,OP) \
465 mchunk_t OPFUNC() const \
468 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
469 help.take ( i , OP ( yield ( i ) ) ) ; \
484 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
487 result &= hn::AllFalse ( D() , help ) ;
501 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
504 result &= hn::AllTrue ( D() , help ) ;
511 friend std::ostream & operator<< ( std::ostream & osr ,
514 uint8_t buffer [
vsize ] ;
517 for ( std::size_t i = 0 ; i <
vsize ; i++ )
518 osr << ( buffer [ i ] ?
"1" :
"0" ) ;
532template <
typename D , std::
size_t N >
538template <
typename D , std::
size_t N >
544template <
typename D , std::
size_t N >
582template <
typename _value_type ,
600template <
typename src_t ,
typename trg_t , std::
size_t vsize >
604 auto p_src = src.
data() ;
605 auto p_trg = trg.
data() ;
606 for ( std::size_t i = 0 ; i <
vsize ; i++ )
607 p_trg[i] = p_src[i] ;
614#define PROMOTE(SRC,TRG) \
615template < std::size_t vsize > \
616void convert ( const hwy_simd_type < SRC , vsize > & src , \
617 hwy_simd_type < TRG , vsize > & trg ) \
619 typedef hn::CappedTag < TRG , vsize > D ; \
620 typedef hn::Rebind < SRC , D > ud_t ; \
622 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += trg.L() ) \
624 auto underfilled = src.template dt_yield < ud_t > ( i ) ; \
625 auto promoted = hn::PromoteTo ( D() , underfilled ) ; \
626 trg.take ( i , promoted ) ; \
630#define CONVERT(SRC,TRG) \
631template < std::size_t vsize > \
632void convert ( const hwy_simd_type < SRC , vsize > & src , \
633 hwy_simd_type < TRG , vsize > & trg ) \
635 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += trg.L() ) \
636 trg.take ( i , hn::ConvertTo ( hn::CappedTag < TRG , vsize > () , \
637 src.yield ( i ) ) ) ; \
640#define DEMOTE(SRC,TRG) \
641template < std::size_t vsize > \
642void convert ( const hwy_simd_type < SRC , vsize > & src , \
643 hwy_simd_type < TRG , vsize > & trg ) \
645 typedef hn::CappedTag < SRC , vsize > X ; \
646 typedef hn::Rebind < TRG , X > ud_t ; \
648 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += hn::Lanes ( ud_t() ) ) \
650 const auto demoted = hn::DemoteTo ( ud_t() , src.yield ( i ) ) ; \
651 trg.template dt_take < ud_t > ( i , demoted ) ; \
684PROMOTE(
unsigned short,
unsigned int)
685PROMOTE(
unsigned int,
unsigned long)
688PROMOTE(
unsigned char,
unsigned short)
689PROMOTE(
unsigned char,
unsigned int)
698DEMOTE(
short,
unsigned char)
706DEMOTE(
long,
unsigned short)
709DEMOTE(
unsigned short,
signed char)
710DEMOTE(
unsigned short,
unsigned char)
712DEMOTE(
unsigned int,
signed char)
713DEMOTE(
unsigned int,
unsigned short)
714DEMOTE(
unsigned int,
unsigned char)
715DEMOTE(
unsigned long,
short)
717DEMOTE(
unsigned long,
signed char)
718DEMOTE(
unsigned long,
unsigned short)
719DEMOTE(
unsigned long,
unsigned int)
720DEMOTE(
unsigned long,
unsigned char)
739template <
typename T , std::
size_t vsize >
743 static_assert ( std::is_integral < T > :: value ,
"int only...!" ) ;
749template <
typename T , std::
size_t vsize >
753 static_assert ( std::is_integral < T > :: value ,
"int only...!" ) ;
761template <
typename T , std::
size_t vsize >
768template <
typename T , std::
size_t vsize >
779template <
typename src_t ,
typename trg_t , std::
size_t vsize >
783 auto p_trg = trg.
data() ;
784 for ( std::size_t i = 0 ; i <
vsize ; i++ )
788template <
typename src_t ,
typename trg_t , std::
size_t vsize >
792 auto p_src = src.
data() ;
793 for ( std::size_t i = 0 ; i <
vsize ; i++ )
801template <
typename _value_type ,
816 static const int ivsize = _vsize ;
823 static_assert ( (
vsize & (
vsize - 1 ) ) == 0 ,
824 "only use powers of two as lane count for highway-based hwy_simd_type" ) ;
832 typedef hn::CappedTag < value_type , vsize >
D ;
833 typedef typename hn::Vec < D >
vec_t ;
839 std::cout <<
"value_type has "
840 <<
sizeof(
value_type) <<
" bytes" << std::endl ;
841 std::cout <<
"hwy_simd_type has "
843 std::cout <<
"HWY_MAX_BYTES: "
844 << HWY_MAX_BYTES <<
" bytes" << std::endl ;
845 std::cout <<
"Lane count is "
846 << L() << std::endl ;
847 std::cout <<
"MaxLanes is "
848 << MaxLanes(
D()) << std::endl ;
849 std::cout <<
"vsize is "
850 <<
vsize <<
" value_type" << std::endl ;
853#ifdef HWY_HAVE_SCALABLE
855 std::size_t L()
const
857 return Lanes ( D() ) ;
862 static constexpr std::size_t
L()
864 return Lanes (
D() ) ;
872 typedef hn::DFromV < hw_index_type >
DI ;
873 typedef hn::TFromD < DI >
TI ;
879 "index type mismatch" ) ;
929 return hn::Load (
D() , inner + i * L() ) ;
934 hn::Store ( rhs ,
D() , inner + i * L() ) ;
942 template <
typename DT >
943 hn::Vec < DT >
dt_yield (
const std::size_t & i )
const
945 return hn::Load ( DT() , inner + i * Lanes ( DT() ) ) ;
948 template <
typename DT >
949 void dt_take (
const std::size_t & i ,
const hn::Vec < DT > & rhs )
951 hn::Store ( rhs , DT() , inner + i * Lanes ( DT() ) ) ;
1032 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1033 take ( i , hn::Set (
D() , x ) ) ;
1042 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1043 take ( i , rhs.
yield ( i ) ) ;
1049 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1050 take ( i , rhs.
yield ( i ) ) ;
1056 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1057 take ( i , hn::Set (
D() , rhs ) ) ;
1091 template <
typename U >
1100 template <
typename U >
1109 template <
typename U >
1117 template <
typename U >
1123 template <
typename U >
1142 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += value.
L() )
1144 value.
take ( i , hn::CopySign ( value.
yield ( i ) ,
1145 sign_source.
yield ( i ) ) ) ;
1160 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += result.
L() )
1162 result.
take ( i , hn::IsFinite ( rhs.
yield ( i ) ) ) ;
1170 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += result.
L() )
1172 result.
take ( i , hn::IsNaN ( rhs.
yield ( i ) ) ) ;
1179 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1181 take ( i , IfThenElse ( m.
yield ( i ) ,
1190 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1192 take ( i , IfThenElse ( m.
yield ( i ) ,
1204 auto v = hn::Iota (
D() , 0 ) ;
1205 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += result.
L() )
1207 result.
take ( i , v ) ;
1208 v = hn::Add ( v , hn::Set (
D() ,
T ( Lanes (
D() ) ) ) ) ;
1221 static const IT ceiling = std::numeric_limits < IT > :: max() ;
1222 static_assert ( (
vsize - 1 ) <= std::size_t ( ceiling ) ,
1223 "value_type too small" ) ;
1225 return index_type::iota() ;
1236 const std::size_t & step = 1 )
1239 static const IT ceiling = std::numeric_limits < IT > :: max() ;
1241 assert ( start + (
vsize - 1 ) * step
1242 <= std::size_t ( ceiling ) ) ;
1244 return ( index_type::iota() * IT(step) ) + IT(start) ;
1264 friend std::ostream & operator<< ( std::ostream & osr ,
1267 std::size_t l = it.
L() ;
1270 osr << it [ i ] << ( i % l == l - 1 ?
" | " :
", " ) ;
1271 osr << it [
vsize - 1 ] <<
")" ;
1275 friend std::istream & operator>> ( std::istream & isr ,
1294 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1295 take ( i , hn::LoadU (
D() , p_src + i * L() ) ) ;
1300 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1301 take ( i , hn::Load (
D() , p_src + i * L() ) ) ;
1308 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1309 hn::StoreU ( yield ( i ) ,
D() , p_trg + i * Lanes (
D() ) ) ;
1314 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1315 hn::Store ( yield ( i ) ,
D() , p_trg + i * Lanes (
D() ) ) ;
1325 void _gather (
const value_type *
const & p_src ,
1326 const index_type & indexes ,
1329 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1330 take ( i , hn::GatherIndex ( D() , p_src ,
1331 indexes.yield ( i ) ) ) ;
1334 void _scatter ( value_type *
const & p_trg ,
1335 const index_type & indexes ,
1336 std::false_type )
const
1338 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1339 hn::ScatterIndex ( yield ( i ) , D() , p_trg ,
1340 indexes.yield ( i ) ) ;
1345 void _gather (
const value_type *
const & p_src ,
1346 const index_type & indexes ,
1349 for ( std::size_t i = 0 ; i <
vsize ; i++ )
1350 inner [ i ] = p_src [ indexes [ i ] ] ;
1353 void _scatter ( value_type *
const & p_trg ,
1354 const index_type & indexes ,
1355 std::true_type )
const
1357 for ( std::size_t i = 0 ; i <
vsize ; i++ )
1358 p_trg [ indexes [ i ] ] = inner [ i ] ;
1363 void gather (
const value_type *
const & p_src ,
1364 const index_type & indexes )
1366 typedef std::integral_constant
1367 < bool ,
sizeof ( value_type ) <= 2 > is_small_t ;
1369 _gather ( p_src , indexes , std::false_type() ) ;
1372 void scatter ( value_type *
const & p_trg ,
1373 const index_type & indexes )
const
1375 typedef std::integral_constant
1376 < bool ,
sizeof ( value_type ) <= 2 > is_small_t ;
1378 _scatter ( p_trg , indexes , is_small_t() ) ;
1396 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1397 take ( i , hn::GatherIndex (
D() , p_src ,
1398 indexes.
yield ( i ) ) ) ;
1404 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1405 hn::ScatterIndex ( yield ( i ) ,
D() , p_trg ,
1406 indexes.
yield ( i ) ) ;
1417 template <
typename index_t >
1419 const index_t & indexes )
1421 for ( std::size_t i = 0 ; i <
vsize ; i++ )
1422 inner [ i ] = p_src [ indexes [ i ] ] ;
1425 template <
typename index_t >
1427 const index_t & indexes )
const
1429 for ( std::size_t i = 0 ; i <
vsize ; i++ )
1430 p_trg [ indexes [ i ] ] = inner [ i ] ;
1435 template <
typename index_t >
1437 const index_t & indexes )
1439 gather ( p_src , indexes ) ;
1451 const std::size_t & step )
1453 auto indexes = IndexesFrom ( 0 , step ) ;
1454 gather ( p_src , indexes ) ;
1458 const std::size_t & step )
const
1460 auto indexes = IndexesFrom ( 0 , step ) ;
1461 scatter ( p_trg , indexes ) ;
1484 #define BROADCAST_HWY_FUNC(FUNC,HFUNC) \
1485 friend hwy_simd_type FUNC ( const hwy_simd_type & arg ) \
1487 hwy_simd_type result ; \
1488 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += arg.L() ) \
1489 result.take ( i , hn::HFUNC ( D() , arg.yield ( i ) ) ) ; \
1505 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += arg.L() )
1507 hn::Div ( hn::Sin (
D() , arg.yield ( i ) ) ,
1508 hn::Cos (
D() , arg.yield ( i ) ) ) ) ;
1525 #undef BROADCAST_HWY_FUNC
1527 #define BROADCAST_HWY_FUNC(FUNC,HFUNC) \
1528 friend hwy_simd_type FUNC ( const hwy_simd_type & arg ) \
1530 hwy_simd_type result ; \
1531 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += arg.L() ) \
1532 result.take ( i , hn::HFUNC ( arg.yield ( i ) ) ) ; \
1543 #undef BROADCAST_HWY_FUNC
1545 #define BROADCAST_HWY_FUNC2(FUNC,HFUNC) \
1546 friend hwy_simd_type FUNC ( const hwy_simd_type & arg1 , \
1547 const hwy_simd_type & arg2 ) \
1549 hwy_simd_type result ; \
1550 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += arg1.L() ) \
1551 result.take ( i , hn::HFUNC ( arg1.yield ( i ) , \
1552 arg2.yield ( i ) ) ) ; \
1566 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += x.
L() )
1583 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += base.
L() )
1586 hn::Mul ( exponent.
yield ( i ) ,
1587 hn::Log (
D() , base.
yield ( i ) ) )
1593#undef BROADCAST_HWY_FUNC2
1606 #define INTEGRAL_ONLY \
1607 static_assert ( std::is_integral < value_type > :: value , \
1608 "this operation is only allowed for integral types" ) ;
1618 #define OPEQ_FUNC(OP,OPFN,CONSTRAINT) \
1619 hwy_simd_type & OP ( const hwy_simd_type & rhs ) \
1621 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
1622 take ( i , OPFN ( yield ( i ) , rhs.yield ( i ) ) ) ; \
1625 hwy_simd_type & OP ( const T & rhs ) \
1627 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
1628 take ( i , OPFN ( yield ( i ) , hn::Set ( D() , rhs ) ) ) ; \
1654 template <
typename rhs_t >
1657 auto * p_r = rhs.
data() ;
1658 for ( std::size_t i = 0 ; i < size() ; i++ )
1659 inner [ i ] /= p_r [ i ] ;
1665 for ( std::size_t i = 0 ; i < size() ; i++ )
1666 inner [ i ] /= rhs ;
1672 template <
typename rhs_t >
1675 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1676 take ( i , hn::Div ( yield ( i ) , rhs.yield ( i ) ) ) ;
1682 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
1683 take ( i , hn::Div ( yield ( i ) , hn::Set (
D() , rhs ) ) ) ;
1687 template <
typename rhs_t >
1690 typedef typename std::is_floating_point < T > :: type is_float_t ;
1691 return div ( rhs , is_float_t() ) ;
1696 template <
typename rhs_t >
1699 auto * p_r = rhs.
data() ;
1700 for ( std::size_t i = 0 ; i < size() ; i++ )
1701 inner [ i ] %= p_r [ i ] ;
1707 auto * p_r = rhs.
data() ;
1708 for ( std::size_t i = 0 ; i < size() ; i++ )
1709 inner [ i ] %= rhs ;
1720#define C_PROMOTE(A,B) \
1721typename std::conditional \
1722 < std::is_same < A , B > :: value , \
1724 decltype ( std::declval < A > () \
1725 + std::declval < B > () ) \
1731#define OP_FUNC(OPFUNC,OPEQ,CONSTRAINT) \
1732 template < typename RHST , \
1733 typename = typename std::enable_if \
1734 < std::is_fundamental < RHST > :: value \
1737 hwy_simd_type < C_PROMOTE ( T , RHST ) , vsize > \
1738 OPFUNC ( hwy_simd_type < RHST , vsize > rhs ) const \
1741 hwy_simd_type < C_PROMOTE ( T , RHST ) , vsize > help ( *this ) ; \
1745 template < typename RHST , \
1746 typename = typename std::enable_if \
1747 < std::is_fundamental < RHST > :: value \
1750 hwy_simd_type < C_PROMOTE ( T , RHST ) , vsize > \
1751 OPFUNC ( RHST rhs ) const \
1754 hwy_simd_type < C_PROMOTE ( T , RHST ) , vsize > help ( *this ) ; \
1758 template < typename LHST , \
1759 typename = typename std::enable_if \
1760 < std::is_fundamental < LHST > :: value \
1763 friend hwy_simd_type < C_PROMOTE ( LHST , T ) , vsize > \
1764 OPFUNC ( LHST lhs , hwy_simd_type rhs ) \
1767 hwy_simd_type < C_PROMOTE ( LHST , T ) , vsize > help ( lhs ) ; \
1816 #define OP_FUNC(OPFUNC,OP,CONSTRAINT) \
1817 hwy_simd_type OPFUNC() const \
1819 hwy_simd_type help ; \
1820 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
1821 help.take ( i , OP ( yield ( i ) ) ) ; \
1833 #define COMPARE_FUNC(OP,OPFUNC) \
1834 friend mask_type OPFUNC ( const hwy_simd_type & lhs , \
1835 const hwy_simd_type & rhs ) \
1838 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += lhs.L() ) \
1839 m.take ( i , OP ( lhs.yield ( i ) , rhs.yield ( i ) ) ) ; \
1842 friend mask_type OPFUNC ( const hwy_simd_type & lhs , \
1843 const value_type & rhs ) \
1846 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += lhs.L() ) \
1847 m.take ( i , OP ( lhs.yield ( i ) , hn::Set ( D() , rhs ) ) ) ; \
1850 friend mask_type OPFUNC ( const value_type & lhs , \
1851 const hwy_simd_type & rhs ) \
1854 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += rhs.L() ) \
1855 m.take ( i , OP ( hn::Set ( D() , lhs ) , rhs.yield ( i ) ) ) ; \
1886 std::size_t
L()
const
1888 return whither.
L() ;
1893 : whether ( _whether ) ,
1894 whither ( _whither )
1897 template <
typename D2 , std::
size_t N2 >
1900 : whether ( _whether ) ,
1901 whither ( _whither )
1910 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() ) \
1912 auto m = whether.
yield ( i ) ; \
1913 auto v = whither.
yield ( i ) ; \
1914 auto vr = hn::Set (
D() , rhs ) ; \
1915 whither.take ( i , hn::IfThenElse ( m , vr , v ) ) ; \
1921 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() ) \
1923 auto m = whether.
yield ( i ) ; \
1924 auto v = whither.
yield ( i ) ; \
1925 auto vr = rhs.
yield ( i ) ; \
1926 whither.take ( i , hn::IfThenElse ( m , vr , v ) ) ; \
1933 #define OPEQ_FUNC(OPFUNC,OP,CONSTRAINT) \
1934 hwy_simd_type & OPFUNC ( const value_type & rhs ) \
1937 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
1939 auto m = whether.yield ( i ) ; \
1940 auto v = whither.yield ( i ) ; \
1941 auto vr = hn::OP ( v , hn::Set ( D() , rhs ) ) ; \
1942 whither.take ( i , hn::IfThenElse ( m , vr , v ) ) ; \
1946 hwy_simd_type & OPFUNC ( const hwy_simd_type & rhs ) \
1949 for ( std::size_t n = 0 , i = 0 ; n < vsize ; ++i , n += L() ) \
1951 auto m = whether.yield ( i ) ; \
1952 auto v = whither.yield ( i ) ; \
1953 auto vr = hn::OP ( v , rhs.yield ( i ) ) ; \
1954 whither.take ( i , hn::IfThenElse ( m , vr , v ) ) ; \
1977 #define OPEQ_FUNC(OPFUNC,OPEQ,CONSTRAINT) \
1978 hwy_simd_type & OPFUNC ( const value_type & rhs ) \
1981 hwy_simd_type mrhs ( whither ) ; \
1983 return ( *this = mrhs ) ; \
1985 hwy_simd_type & OPFUNC ( const hwy_simd_type & rhs ) \
1988 hwy_simd_type mrhs ( whither ) ; \
1990 return ( *this = mrhs ) ; \
2000 #undef INTEGRAL_ONLY
2020 #define CLAMP(FNAME,REL) \
2021 hwy_simd_type FNAME ( const T & threshold ) const \
2023 return (*this) ( *this REL threshold ) = threshold ; \
2025 hwy_simd_type FNAME ( const hwy_simd_type & threshold ) const \
2027 return (*this) ( *this REL threshold ) = threshold ; \
2041 vec_t s ( hn::Zero (
D() ) ) ;
2042 for ( std::size_t n = 0 , i = 0 ; n <
vsize ; ++i , n += L() )
2044 return hn::GetLane ( hn::SumOfLanes (
D() , s ) ) ;
2056template <
typename T , std::
size_t N >
2065template <
typename T >
2067:
public std::allocator < T >
2070 using typename base_t::pointer ;
2073 return (pointer) hwy::AllocateAlignedBytes
2074 ( n *
sizeof(T) ,
nullptr ,
nullptr ) ;
2078 hwy::FreeAlignedBytes ( p ,
nullptr ,
nullptr ) ;
2080 using base_t::base_t ;
2089template <
typename T , std::
size_t N >
2097#ifndef HWY_SIMD_ALLOCATOR
2098#define HWY_SIMD_ALLOCATOR
2103template <
typename T , std::
size_t N >
2114#ifndef VSPLINE_VECTOR_NBYTES
2128#define VSPLINE_VECTOR_NBYTES (4*HWY_MAX_BYTES)
class template simd_type provides a fixed-size container type for small sets of fundamentals which ar...
void load(const value_type *const p_src)
void store(value_type *const p_trg) const
#define OP_FUNC(OPFUNC, OPEQ)
#define BROADCAST_HWY_FUNC(FUNC, HFUNC)
#define OPEQ_FUNC(OP, OPFN)
#define CONVERT(SRC, TRG)
#define CLAMP(FNAME, REL)
#define PROMOTE(SRC, TRG)
void convert(const hwy_simd_type< src_t, vsize > &src, hwy_simd_type< trg_t, vsize > &trg)
bool none_of(const mchunk_t< D, N > &arg)
bool any_of(const mchunk_t< D, N > &arg)
bool all_of(const mchunk_t< D, N > &arg)
struct HWY_ALIGN hwy_simd_type
class template hwy_simd_type provides a fixed-size container type for small-ish sets of fundamentals ...
HWY_INLINE V Atan2(const D d, V y, V x)
HWY_INLINE V Atan(const D d, V y, V x)
masked_type(const mask_type &_whether, hwy_simd_type &_whither)
masked_type(const mchunk_t< D2, N2 > &_whether, hwy_simd_type &_whither)
hwy_simd_type(const vspline::simd_type< U, vsize > &rhs)
hwy_simd_type(const T &x)
static mask_type isnegative(const hwy_simd_type &rhs)
hwy_simd_type & setQnan(const mask_type &m)
void load_aligned(const value_type *const &p_src)
static constexpr std::size_t L()
friend hwy_simd_type pow(const hwy_simd_type &base, const hwy_simd_type &exponent)
void take(const std::size_t &i, const vec_t &rhs)
COMPARE_FUNC(Ne, operator!=)
void gather(const value_type *const &p_src, const index_type &indexes)
vec_t yield(const std::size_t &i) const
hwy_simd_type< TI, vsize > index_type
static const index_type IndexesFromZero()
static mask_type isnan(const hwy_simd_type &rhs)
COMPARE_FUNC(Le, operator<=)
static mask_type isfinite(const hwy_simd_type &rhs)
void dt_take(const std::size_t &i, const hn::Vec< DT > &rhs)
void rscatter(value_type *const &p_trg, const std::size_t &step) const
mchunk_t< D, vsize > MaskType
hn::CappedTag< value_type, vsize > D
hn::Vec< hn::RebindToSigned< D > > hw_index_type
hwy_simd_type(const hwy_simd_type &&x)
COMPARE_FUNC(Gt, operator>)
COMPARE_FUNC(Lt, operator<)
static const hwy_simd_type One()
void store(value_type *const p_trg) const
static const hwy_simd_type Zero()
hwy_simd_type & div(const T &rhs, std::true_type)
hn::Vec< DT > dt_yield(const std::size_t &i) const
void gather(const value_type *const &p_src, const index_t &indexes)
void scatter(value_type *const &p_trg, const index_type &indexes) const
hwy_simd_type & div(const rhs_t &rhs, std::false_type)
void scatter(value_type *const &p_trg, const index_t &indexes) const
hwy_simd_type & div(const T &rhs, std::false_type)
friend hwy_simd_type atan2(const hwy_simd_type &y, const hwy_simd_type &x)
mchunk_t< D, vsize > mask_type
hwy_simd_type & div(const rhs_t &rhs, std::true_type)
void store_aligned(value_type *const &p_trg) const
static constexpr size_type size()
static const index_type IndexesFrom(const std::size_t &start, const std::size_t &step=1)
void rgather(const value_type *const &p_src, const std::size_t &step)
COMPARE_FUNC(Ge, operator>=)
hwy_simd_type(const value_type *const &p_src, const index_t &indexes)
hwy_simd_type(const hwy_simd_type &x)
hwy_simd_type & setZero(const mask_type &m)
void load(const value_type *const &p_src)
static const hwy_simd_type iota()
hn::DFromV< hw_index_type > DI
hwy_simd_type(const hwy_simd_type< U, vsize > &rhs)
COMPARE_FUNC(Eq, operator==)
static hwy_simd_type copysign(hwy_simd_type value, const hwy_simd_type &sign_source)
mask type for hwy_simd_type. This is a type which holds a set of masks stored in uint8_t,...
void take(const std::size_t &i, const vmask_type &rhs)
vmask_type yield(const std::size_t &i) const
static constexpr std::size_t L()
const uint8_t * data() const
mchunk_t(const mchunk_t< D2, vsize > &rhs)
void LoadFromBytes(const uint8_t *p_trg)
void transfer(const mchunk_t< D1, vsize > &in_mask, mchunk_t< D2, vsize > &out_mask)
mchunk_t(const mchunk_t &)=default
void SaveToBytes(uint8_t *p_trg) const
vspline::simd_allocator< vspline::hwy_simd_type< T, N > > allocator_type
simd_allocator< hwy_simd_type< T, N > > type
vspline creates vigra::MultiArrays of vectorized types. As long as the vectorized types are Vc::SimdA...
void deallocate(T *p, std::size_t n)
std::allocator< T > base_t
pointer allocate(std::size_t n)