vspline 1.1.0
Generic C++11 Code for Uniform B-Splines
|
SIMD type using highway. More...
#include <iostream>
#include <functional>
#include <type_traits>
#include <assert.h>
#include <hwy/highway.h>
#include <hwy/contrib/math/math-inl.h>
#include <hwy/aligned_allocator.h>
#include <hwy/print-inl.h>
#include "hwy_atan2.h"
Go to the source code of this file.
Classes | |
struct | HWY_NAMESPACE::mchunk_t< D, _vsize > |
mask type for hwy_simd_type. This is a type which holds a set of masks stored in uint8_t, as the highway mask storing function provides. So this type is memory-backed, just like hwy_simd_type. Template arguments are the corresponding hwy_simd_type's tag type and its lane count. highway is strict about which vectors and masks can interoperate, and only allows 'direct' interoperation if the types involved 'match' in size. Masks pertaining to vectors of differently-sized T aren't directly interoperable because they don't have the same lane count. One requires k masks of one type and k * 2 ^ i of the other. Here, we follow a different paradigm: The top-level objects we're dealing with have a fixed 'vsize', the number of lanes they hold. This should be a power of two. The paradigm is that objects with equal vsize should be interoperable, no matter what lane count the hardware vectors have which are used to implement their functionality. This makes user code simpler: users pick a vsize which they use for a body of code, all vector-like objects use the common vsize, and the implementation of the vector-like objects takes care of 'rolling out' the operations to hardware vectors. At times this produces what I call 'friction' - if the underlying hardware vectors and masks are not directly compatible, code is needed to interoperate them, and this code can at times be slow. So the recommendation for users is to avoid 'friction' by avoiding mixing differently-sized types, but with the given paradigm, this is a matter of performance tuning rather than imposing constraints on code structure. Some of the 'friction' might be mitigated by additional code using highway's up- and down-scaling routines, but for now the code rather uses 'goading' with small loops over the backing memory, relying on the compiler to handle this efficiently. More... | |
struct | HWY_NAMESPACE::hwy_simd_type< _value_type, _vsize > |
struct | HWY_NAMESPACE::hwy_simd_type< _value_type, _vsize >::masked_type |
struct | vspline::simd_allocator< T > |
struct | vspline::allocator_traits< hwy_simd_type< T, N > > |
struct | std::allocator_traits< vspline::hwy_simd_type< T, N > > |
Namespaces | |
namespace | HWY_NAMESPACE |
namespace | vspline |
namespace | std |
Macros | |
#define | HWY_SIMD_TYPE_H |
#define | OPEQ_FUNC(OP, OPFN) |
#define | OP_FUNC(OPFUNC, OPEQ) |
#define | OP_FUNC(OPFUNC, OP) |
#define | PROMOTE(SRC, TRG) |
#define | CONVERT(SRC, TRG) |
#define | DEMOTE(SRC, TRG) |
#define | BROADCAST_HWY_FUNC(FUNC, HFUNC) |
#define | BROADCAST_HWY_FUNC(FUNC, HFUNC) |
#define | BROADCAST_HWY_FUNC2(FUNC, HFUNC) |
#define | INTEGRAL_ONLY |
#define | OPEQ_FUNC(OP, OPFN, CONSTRAINT) |
#define | C_PROMOTE(A, B) |
#define | OP_FUNC(OPFUNC, OPEQ, CONSTRAINT) |
#define | OP_FUNC(OPFUNC, OP, CONSTRAINT) |
#define | COMPARE_FUNC(OP, OPFUNC) |
#define | OPEQ_FUNC(OPFUNC, OP, CONSTRAINT) |
#define | OPEQ_FUNC(OPFUNC, OPEQ, CONSTRAINT) |
#define | CLAMP(FNAME, REL) |
#define | HWY_SIMD_ALLOCATOR |
#define | VSPLINE_VECTOR_NBYTES (4*HWY_MAX_BYTES) |
Typedefs | |
template<typename T , std::size_t N> | |
using | vspline::hwy_simd_type = HWY_NAMESPACE::hwy_simd_type< T, N > |
Functions | |
HWY_BEFORE_NAMESPACE () | |
template<typename D , std::size_t N> | |
bool | HWY_NAMESPACE::any_of (const mchunk_t< D, N > &arg) |
template<typename D , std::size_t N> | |
bool | HWY_NAMESPACE::all_of (const mchunk_t< D, N > &arg) |
template<typename D , std::size_t N> | |
bool | HWY_NAMESPACE::none_of (const mchunk_t< D, N > &arg) |
template<typename src_t , typename trg_t , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const hwy_simd_type< src_t, vsize > &src, hwy_simd_type< trg_t, vsize > &trg) |
template<typename T , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const hwy_simd_type< double, vsize > &src, hwy_simd_type< T, vsize > &trg) |
template<typename T , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const hwy_simd_type< T, vsize > &src, hwy_simd_type< double, vsize > &trg) |
template<typename T , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const vspline::simd_type< T, vsize > &src, hwy_simd_type< T, vsize > &trg) |
template<typename T , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const hwy_simd_type< T, vsize > &src, vspline::simd_type< T, vsize > &trg) |
template<typename src_t , typename trg_t , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const vspline::simd_type< src_t, vsize > &src, hwy_simd_type< trg_t, vsize > &trg) |
template<typename src_t , typename trg_t , std::size_t vsize> | |
void | HWY_NAMESPACE::convert (const hwy_simd_type< src_t, vsize > &src, vspline::simd_type< trg_t, vsize > &trg) |
HWY_AFTER_NAMESPACE () | |
Variables | |
template<typename _value_type , std::size_t _vsize> | |
struct HWY_ALIGN | HWY_NAMESPACE::hwy_simd_type |
class template hwy_simd_type provides a fixed-size container type for small-ish sets of fundamentals which are stored in a POD C vector. This implementation uses highway to code the loops more efficiently. It mimics Vc::SimdArray, just like vspline::simd_type does, and the code is derived from vspline::simd_array, changing the workhorse code from simple loops to the use of highway functions. The resulting type, with its 'container-typical' interface, slots in well with the higher-level constructs used in vspline/lux and, at the same time, 'contains' the SIMD implementation in this class, so that its use doesn't need to be known outside. As an arithmetic type, hwy_simd_type provides many mathematical operators and some functions - most of them are realized by calling corresponding highway functions, but some (still) rely on loops, either because they aren't performance-critical or because there is no highway code to be had for the purpose. Some methods are (currently) exclusive to this class, but may be ported to other SIMD interface classes; apart from the original 'goading' class vspline::simd_type, there is also an implementation using std::simd in pv/vspline/std_simd_type.h. The lane count for a hwy_simd_type in this body of code should be a power of two, and it should be at least as large as the hardware lane count of the smallest fundamental used in vectorized form. To cover all eventualities, the hardware lane count of a vector of unsigned char (uint8_t) is a good choice. This choice prevents hwy_simd_type objects of small T from remaining partly empty when a given small vsize is chosen to cater for vectors with larger T. At times, this will lead to overly high register pressure, and the overall performance may benefit from allowing partially filled hwy_simd_type via a smaller vsize, which is feasible because hwy_simd_type uses highway vectors with CappedTag. More... | |
SIMD type using highway.
This is a new, tentative implementation of vspline::simd_type using highway (https://github.com/google/highway). highway provides code to work with hardware SIMD in a portable way, but it's still very close to the hardware, and does not provide support for vectors larger than the hardware's register size. vspline::simd_type, on the other hand, is a fixed-size construct which may well exceed the hardware size. The 'goading' implementation of vspline::simd_type uses small loops over a POD C vector to implement the functionality - hoping that the compiler will 'get it' and autovectorize the code. This implementation is also based on a POD C vector, but the functionality is implemented (wherever this seems feasible or sensible) by using highway SIMD code. In a way it's enforcing by explicit code what 'ordinary' vspline::simd_type hopes to get from the compiler via autovectorization, and since the compiler's 'insight' into the code is limited, the explicit approach tends to come out on top, producing SIMD binary more often (and in more efficient variants) than the goading approach.
Some of the functionality is implemented by simple goading routines. This is either because this is deemed acceptable (e.g. printing a hwy_simd_type to the console is not in any way time critical, nor can it benefit from SIMD code) - or because I haven't yet tackled writing 'proper' SIMD code for the functionality in question - for example, type conversions are still done with goading. This state of affairs also reflects my implementation strategy: I started out with the 'ordinary' vspline::simd_type and replaced more and more of the goading code by 'proper' SIMD code.
'Backing' the SIMD vectors like that is only one way of handling the SIMD types in the background, but has the advantage of, first, being compatible with the goading code (so one can 'go over the memory' or 'fall back to scalar') and, second, being general, so that both sized and sizeless vectors can be implemented with the same code. The disadvantage is that the compiler may not find all opportunities for keeping the SIMD code 'afloat' in a set of registers, but may at times resort to actually creating and using the underlying POD C array, rather than optimizing it away.
Nevertheless, this implementation seems to tend towards 'proper' SIMD code rather than towards the goading implementation - first tests showed the code took up to 30% longer than code done with Vc on an AVX2 machine, whereas the goading code often takes twice or thrice as long, so it seems to be a path worth pursuing.
Definition in file hwy_simd_type.h.
#define BROADCAST_HWY_FUNC | ( | FUNC, | |
HFUNC | |||
) |
Definition at line 1527 of file hwy_simd_type.h.
#define BROADCAST_HWY_FUNC | ( | FUNC, | |
HFUNC | |||
) |
Definition at line 1527 of file hwy_simd_type.h.
#define BROADCAST_HWY_FUNC2 | ( | FUNC, | |
HFUNC | |||
) |
Definition at line 1545 of file hwy_simd_type.h.
#define C_PROMOTE | ( | A, | |
B | |||
) |
Definition at line 1720 of file hwy_simd_type.h.
#define CLAMP | ( | FNAME, | |
REL | |||
) |
Definition at line 2020 of file hwy_simd_type.h.
#define COMPARE_FUNC | ( | OP, | |
OPFUNC | |||
) |
Definition at line 1833 of file hwy_simd_type.h.
#define CONVERT | ( | SRC, | |
TRG | |||
) |
Definition at line 630 of file hwy_simd_type.h.
#define DEMOTE | ( | SRC, | |
TRG | |||
) |
Definition at line 640 of file hwy_simd_type.h.
#define HWY_SIMD_ALLOCATOR |
Definition at line 2098 of file hwy_simd_type.h.
#define HWY_SIMD_TYPE_H |
Definition at line 101 of file hwy_simd_type.h.
#define INTEGRAL_ONLY |
Definition at line 1606 of file hwy_simd_type.h.
#define OP_FUNC | ( | OPFUNC, | |
OP | |||
) |
Definition at line 1816 of file hwy_simd_type.h.
#define OP_FUNC | ( | OPFUNC, | |
OP, | |||
CONSTRAINT | |||
) |
Definition at line 1816 of file hwy_simd_type.h.
#define OP_FUNC | ( | OPFUNC, | |
OPEQ | |||
) |
Definition at line 1816 of file hwy_simd_type.h.
#define OP_FUNC | ( | OPFUNC, | |
OPEQ, | |||
CONSTRAINT | |||
) |
Definition at line 1816 of file hwy_simd_type.h.
#define OPEQ_FUNC | ( | OP, | |
OPFN | |||
) |
Definition at line 1977 of file hwy_simd_type.h.
#define OPEQ_FUNC | ( | OP, | |
OPFN, | |||
CONSTRAINT | |||
) |
Definition at line 1977 of file hwy_simd_type.h.
#define OPEQ_FUNC | ( | OPFUNC, | |
OP, | |||
CONSTRAINT | |||
) |
Definition at line 1977 of file hwy_simd_type.h.
#define OPEQ_FUNC | ( | OPFUNC, | |
OPEQ, | |||
CONSTRAINT | |||
) |
Definition at line 1977 of file hwy_simd_type.h.
#define PROMOTE | ( | SRC, | |
TRG | |||
) |
Definition at line 614 of file hwy_simd_type.h.
#define VSPLINE_VECTOR_NBYTES (4*HWY_MAX_BYTES) |
Definition at line 2128 of file hwy_simd_type.h.
HWY_AFTER_NAMESPACE | ( | ) |
HWY_BEFORE_NAMESPACE | ( | ) |