Template Numerical Library version\ main:8b8c8226
|
Namespace holding segments data structures. More...
Typedefs | |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator > |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRDefault = CSRScalar< Device, Index, IndexAllocator > |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator > |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator > |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator > |
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >> | |
using | CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator > |
template<typename Device , typename Index > | |
using | CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< std::remove_const_t< Index >, Device > > |
template<typename Device , typename Index > | |
using | CSRViewDefault = CSRViewScalar< Device, Index > |
template<typename Device , typename Index , int ThreadsInBlock = 256> | |
using | CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< std::remove_const_t< Index >, Device, ThreadsInBlock > > |
template<typename Device , typename Index > | |
using | CSRViewLight = CSRView< Device, Index, CSRLightKernel< std::remove_const_t< Index >, Device > > |
template<typename Device , typename Index > | |
using | CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< std::remove_const_t< Index >, Device > > |
template<typename Device , typename Index > | |
using | CSRViewVector = CSRView< Device, Index, CSRVectorKernel< std::remove_const_t< Index >, Device > > |
Enumerations | |
enum | ElementsOrganization { ColumnMajorOrder = 0 , RowMajorOrder } |
enum | EllpackKernelType { Scalar , Vector , Vector2 , Vector4 , Vector8 , Vector16 } |
enum | LightCSRSThreadsMapping { LightCSRConstantThreads , CSRLightAutomaticThreads , CSRLightAutomaticThreadsLightSpMV } |
Functions | |
template<typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real > | |
__global__ void | EllpackCudaReductionKernelCompact (Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize) |
template<typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real > | |
__global__ void | EllpackCudaReductionKernelFull (Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize) |
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int WarpSize> | |
std::ostream & | operator<< (std::ostream &str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize > &segments) |
template<typename Device , typename Index , ElementsOrganization Organization, int WarpSize> | |
std::ostream & | operator<< (std::ostream &str, const BiEllpackView< Device, Index, Organization, WarpSize > &segments) |
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization> | |
std::ostream & | operator<< (std::ostream &str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization > &segments) |
template<typename Device , typename Index , typename Kernel , typename IndexAllocator > | |
std::ostream & | operator<< (std::ostream &str, const CSR< Device, Index, Kernel, IndexAllocator > &segments) |
Insertion operator of CSR segments to output stream. More... | |
template<typename Device , typename Index , typename Kernel > | |
std::ostream & | operator<< (std::ostream &str, const CSRView< Device, Index, Kernel > &segments) |
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int Alignment> | |
std::ostream & | operator<< (std::ostream &str, const Ellpack< Device, Index, IndexAllocator, Organization, Alignment > &segments) |
template<typename Device , typename Index , ElementsOrganization Organization, int Alignment> | |
std::ostream & | operator<< (std::ostream &str, const EllpackView< Device, Index, Organization, Alignment > &ellpack) |
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int SliceSize> | |
std::ostream & | operator<< (std::ostream &str, const SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize > &segments) |
template<typename Device , typename Index , ElementsOrganization Organization, int SliceSize> | |
std::ostream & | operator<< (std::ostream &str, const SlicedEllpackView< Device, Index, Organization, SliceSize > &segments) |
template<typename Segments > | |
std::ostream & | printSegments (const Segments &segments, std::ostream &str) |
Print segments sizes, i.e. the segments setup. More... | |
template<int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView , typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args> | |
__global__ void | reduceSegmentsCSRAdaptiveKernel (BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args) |
template<typename BlocksView , typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args> | |
__global__ void | reduceSegmentsCSRAdaptiveKernel (BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args) |
template<int BlockSize, int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real > | |
__global__ void | reduceSegmentsCSRHybridMultivectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero) |
template<int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real > | |
__global__ void | reduceSegmentsCSRHybridVectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero) |
template<typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args> | |
__global__ void | reduceSegmentsCSRKernelVector (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args) |
template<int BlockSize, int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real > | |
__global__ void | reduceSegmentsCSRLightMultivectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero) |
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep > | |
__global__ void | SpMVCSRLight16 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) |
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep > | |
__global__ void | SpMVCSRLight2 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) |
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep > | |
__global__ void | SpMVCSRLight4 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) |
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep > | |
__global__ void | SpMVCSRLight8 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) |
template<int ThreadsPerSegment, typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep > | |
__global__ void | SpMVCSRVector (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) |
Namespace holding segments data structures.
Segments represent data structure for manipulation with several local arrays (denoted also as segments) having different size in general. All the local arrays are supposed to be allocated in one continuos global array. The data structure segments offers mapping between indexes of particular local arrays and indexes of the global array. In addition,one can perform parallel operations like for or flexible reduction on partical local arrays.
A typical example for use of segments is implementation of sparse matrices. Sparse matrix like the following
\[ \left( \begin{array}{ccccc} 1 & 0 & 2 & 0 & 0 \\ 0 & 0 & 5 & 0 & 0 \\ 3 & 4 & 7 & 9 & 0 \\ 0 & 0 & 0 & 0 & 12 \\ 0 & 0 & 15 & 17 & 20 \end{array} \right) \]
is usually first compressed which means that the zero elements are omitted to get the following "matrix":
\[ \begin{array}{ccccc} 1 & 2 \\ 5 \\ 3 & 4 & 7 & 9 \\ 12 \\ 15 & 17 & 20 \end{array} \]
We have to store column index of each matrix elements as well in a "matrix" like this:
\[ \begin{array}{ccccc} 0 & 2 \\ 2 \\ 0 & 1 & 2 & 3 \\ 4 \\ 2 & 3 & 4 \end{array} \]
Such "matrices" can be stored in memory in a row-wise manner in one contiguous array because of the performance reasons. The first "matrix" (i.e. values of the matrix elements) would be stored as follows
\[ \begin{array}{|cc|c|cccc|c|cc|} 1 & 2 & 5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array} \]
and the second one (i.e. column indexes of the matrix values) as follows
\[ \begin{array}{|cc|c|cccc|c|cc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array} \]
What we see above is so called CSR sparse matrix format. It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout of the matrix elements in the memory. They have to deal especially with two difficulties:
Necessity of working with this kind of data structure is not limited only to sparse matrices. We could name at least few others:
In general, segments can be used for problems that somehow corresponds wit 2D data structure where each row can have different size and we need to perform miscellaneous operations within the rows. The name segments comes from segmented parallel reduction or segmented scan (prefix-sum).
The following example demonstrates the essence of segments in TNL:
We demonstrate two formats of segments - TNL::Algorithms::Segments::CSR and TNL::Algorithms::Segments::Ellpack running on both CPU and GPU (lines 58-76). For each of them, we call function SegmentsExample
which first creates given segments (line 18). The segments are defined by the sizes of particular segments.
Next we allocate array with data related to the segments (line 24). The number of elemets managed by the segments is given by TNL::Algorithms::Segments::CSR::getStorageSize and TNL::Algorithms::Segments::Ellpack::getStorageSize respectively.
Next we setup the segments elements (lines 29-33) by calling TNL::Algorithms::Segments::CSR::forAllElements (and TNL::Algorithms::Segments::CSR::forAllElements respectively) which iterates over all elements of the segments in parallel and perform given lambda function. The lambda function receives index of the segment (segmentIdx
), index of the element within the segment (localIdx
), index of the element within the array data
and a reference to boolean (compute
) which serves as a hint for interrupting the iteration over the elements of given segment when it is set to false
. The value of the elements having the local index smaller or equal to the segments index is set to the value of the segment index. It creates, in fact, lower triangular matrix elements of which have values equal to row index.
Next we use a function TNL::Algorithms::Segments::printSegments to print the content of the segments (lines 38-39). To do this we have to provide a lambda function fetch
(line 38) which returns value of elements with given global index.
Finally we show how to compute sum of all elemnts in each segment. Firstly, we create vector into which we will store the sums (line 44) and get its view (line 45). The size of the vector is given by the number of the segments which can be obtained by the means of the method TNL::Algorithms::Segments::CSR::getSegmentsCount (and TNL::Algorithms::Segments::Ellpack::getSegmentsCount respectively). The sums are computed using the method TNL::Algorithms::Segments::CSR::reduceAllSegments (and TNL::Algorithms::Segments::Ellpack::reduceAllSegments respectively) which works the same way as the flexible parallel reduction (TNL::Algorithms::reduce). It requires lambda functions fetch
for reading the data related to particular elements of the segments, function reduce
which is std::plus in this case and a function keep
to store the result of sums in particular segments.
The result looks as follows:
Note that the Ellpack format manages more elements than we asked for. It is because some formats use padding elements for more efficient memory accesses. The padding elements are available to the user as well and so we must ensure that work only with those elements we want to. This is the reason why we use the if statement on the line 31 when setting up the values of the elements in segments. The padding elements can be used in case when we later need more elements than we requested. However, the segments data structure does not allow any resizing of the segments. One can change the sizes of the segments, however, the access to the originally managed data is becoming invalid at that moment.
std::ostream & TNL::Algorithms::Segments::operator<< | ( | std::ostream & | str, |
const CSR< Device, Index, Kernel, IndexAllocator > & | segments | ||
) |
Insertion operator of CSR segments to output stream.
Device | is the device type of the source segments. |
Index | is the index type of the source segments. |
Kernel | is kernel type of the source segments. |
IndexAllocator | is the index allocator of the source segments. |
str | is the output stream. |
segments | are the source segments. |
std::ostream & TNL::Algorithms::Segments::printSegments | ( | const Segments & | segments, |
std::ostream & | str | ||
) |
Print segments sizes, i.e. the segments setup.
Segments | is type of segments. |
segments | is an instance of segments. |
str | is output stream. |