Template Numerical Library version main:8b8c8226
Loading...
Searching...
No Matches
Classes | Typedefs | Enumerations | Functions
TNL::Algorithms::Segments Namespace Reference

Namespace holding segments data structures. More...

Classes

class  BiEllpack
 
class  BiEllpackSegmentView
 
class  BiEllpackView
 
class  ChunkedEllpack
 
class  ChunkedEllpackSegmentView
 
class  ChunkedEllpackSegmentView< Index, ColumnMajorOrder >
 
class  ChunkedEllpackSegmentView< Index, RowMajorOrder >
 
class  ChunkedEllpackView
 
class  CSR
 Data structure for CSR segments format. More...
 
struct  CSRAdaptiveKernel
 
struct  CSRAdaptiveKernelView
 
struct  CSRHybridKernel
 
struct  CSRLightKernel
 
struct  CSRScalarKernel
 
struct  CSRScalarKernelreduceSegmentsDispatcher
 
struct  CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
 
struct  CSRScalarKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
 
struct  CSRVectorKernel
 
class  CSRView
 
struct  DefaultElementsOrganization
 
class  Ellpack
 
class  EllpackView
 
class  SegmentElement
 Simple structure representing one element of a segment. More...
 
class  SegmentView
 Data structure for accessing particular segment. More...
 
class  SegmentView< Index, ColumnMajorOrder >
 Data structure for accessing particular segment. More...
 
class  SegmentView< Index, RowMajorOrder >
 
class  SegmentViewIterator
 Iterator for iterating over elements of a segment. More...
 
class  SlicedEllpack
 
class  SlicedEllpackView
 

Typedefs

template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >
 
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRDefault = CSRScalar< Device, Index, IndexAllocator >
 
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >
 
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >
 
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator >
 
template<typename Device , typename Index , typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index >>
using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >
 
template<typename Device , typename Index >
using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< std::remove_const_t< Index >, Device > >
 
template<typename Device , typename Index >
using CSRViewDefault = CSRViewScalar< Device, Index >
 
template<typename Device , typename Index , int ThreadsInBlock = 256>
using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< std::remove_const_t< Index >, Device, ThreadsInBlock > >
 
template<typename Device , typename Index >
using CSRViewLight = CSRView< Device, Index, CSRLightKernel< std::remove_const_t< Index >, Device > >
 
template<typename Device , typename Index >
using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< std::remove_const_t< Index >, Device > >
 
template<typename Device , typename Index >
using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< std::remove_const_t< Index >, Device > >
 

Enumerations

enum  ElementsOrganization { ColumnMajorOrder = 0 , RowMajorOrder }
 
enum  EllpackKernelType {
  Scalar , Vector , Vector2 , Vector4 ,
  Vector8 , Vector16
}
 
enum  LightCSRSThreadsMapping { LightCSRConstantThreads , CSRLightAutomaticThreads , CSRLightAutomaticThreadsLightSpMV }
 

Functions

template<typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real >
__global__ void EllpackCudaReductionKernelCompact (Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize)
 
template<typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real >
__global__ void EllpackCudaReductionKernelFull (Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize)
 
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int WarpSize>
std::ostream & operator<< (std::ostream &str, const BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize > &segments)
 
template<typename Device , typename Index , ElementsOrganization Organization, int WarpSize>
std::ostream & operator<< (std::ostream &str, const BiEllpackView< Device, Index, Organization, WarpSize > &segments)
 
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization>
std::ostream & operator<< (std::ostream &str, const ChunkedEllpack< Device, Index, IndexAllocator, Organization > &segments)
 
template<typename Device , typename Index , typename Kernel , typename IndexAllocator >
std::ostream & operator<< (std::ostream &str, const CSR< Device, Index, Kernel, IndexAllocator > &segments)
 Insertion operator of CSR segments to output stream. More...
 
template<typename Device , typename Index , typename Kernel >
std::ostream & operator<< (std::ostream &str, const CSRView< Device, Index, Kernel > &segments)
 
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int Alignment>
std::ostream & operator<< (std::ostream &str, const Ellpack< Device, Index, IndexAllocator, Organization, Alignment > &segments)
 
template<typename Device , typename Index , ElementsOrganization Organization, int Alignment>
std::ostream & operator<< (std::ostream &str, const EllpackView< Device, Index, Organization, Alignment > &ellpack)
 
template<typename Device , typename Index , typename IndexAllocator , ElementsOrganization Organization, int SliceSize>
std::ostream & operator<< (std::ostream &str, const SlicedEllpack< Device, Index, IndexAllocator, Organization, SliceSize > &segments)
 
template<typename Device , typename Index , ElementsOrganization Organization, int SliceSize>
std::ostream & operator<< (std::ostream &str, const SlicedEllpackView< Device, Index, Organization, SliceSize > &segments)
 
template<typename Segments >
std::ostream & printSegments (const Segments &segments, std::ostream &str)
 Print segments sizes, i.e. the segments setup. More...
 
template<int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView , typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args>
__global__ void reduceSegmentsCSRAdaptiveKernel (BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args)
 
template<typename BlocksView , typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args>
__global__ void reduceSegmentsCSRAdaptiveKernel (BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args)
 
template<int BlockSize, int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real >
__global__ void reduceSegmentsCSRHybridMultivectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero)
 
template<int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real >
__global__ void reduceSegmentsCSRHybridVectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero)
 
template<typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real , typename... Args>
__global__ void reduceSegmentsCSRKernelVector (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args)
 
template<int BlockSize, int ThreadsPerSegment, typename Offsets , typename Index , typename Fetch , typename Reduction , typename ResultKeeper , typename Real >
__global__ void reduceSegmentsCSRLightMultivectorKernel (int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero)
 
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep >
__global__ void SpMVCSRLight16 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID)
 
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep >
__global__ void SpMVCSRLight2 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID)
 
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep >
__global__ void SpMVCSRLight4 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID)
 
template<typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep >
__global__ void SpMVCSRLight8 (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID)
 
template<int ThreadsPerSegment, typename Real , typename Index , typename OffsetsView , typename Fetch , typename Reduce , typename Keep >
__global__ void SpMVCSRVector (OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID)
 

Detailed Description

Namespace holding segments data structures.

Segments represent a data structure for manipulation with several local arrays (denoted also as segments) having, in general, different sizes. All the local arrays are supposed to be allocated in one contiguous global array. The data structure segments offers mapping between indexes of particular local arrays and indexes of the global array. In addition, one can perform parallel operations like for-loops or flexible reduction on particular local arrays.

A typical example for use of segments is implementation of sparse matrices. Sparse matrix like the following

\[ \left( \begin{array}{ccccc} 1 & 0 & 2 & 0 & 0 \\ 0 & 0 & 5 & 0 & 0 \\ 3 & 4 & 7 & 9 & 0 \\ 0 & 0 & 0 & 0 & 12 \\ 0 & 0 & 15 & 17 & 20 \end{array} \right) \]

is usually first compressed which means that the zero elements are omitted to get the following "matrix":

\[ \begin{array}{ccccc} 1 & 2 \\ 5 \\ 3 & 4 & 7 & 9 \\ 12 \\ 15 & 17 & 20 \end{array} \]

We have to store the column index of each matrix element as well in a "matrix" like this:

\[ \begin{array}{ccccc} 0 & 2 \\ 2 \\ 0 & 1 & 2 & 3 \\ 4 \\ 2 & 3 & 4 \end{array} \]

Such "matrices" can be stored in memory in a row-wise manner in one contiguous array for performance reasons. The first "matrix" (i.e. values of the matrix elements) would be stored as follows

\[ \begin{array}{|cc|c|cccc|c|ccc|} 1 & 2 & 5 & 3 & 4 & 7 & 9 & 12 & 15 & 17 & 20 \end{array} \]

and the second one (i.e. column indexes of the matrix values) as follows

\[ \begin{array}{|cc|c|cccc|c|ccc|} 0 & 2 & 2 & 0 & 1 & 2 & 3 & 4 & 2 & 3 & 4 \end{array} \]

What we see above is so called CSR sparse matrix format. It is the most popular format for storage of sparse matrices designed for high performance. However, it may not be the most efficient format for storage of sparse matrices on GPUs. Therefore many other formats have been developed to get better performance. These formats often have different layout of the matrix elements in the memory. They have to deal especially with two difficulties:

  1. Efficient storage of matrix elements in the memory to fulfill the requirements of coalesced memory accesses on GPUs or good spatial locality for efficient use of caches on CPUs.
  2. Efficient mapping of GPU threads to different matrix rows.

Necessity of working with this kind of data structure is not limited only to sparse matrices. We could name at least few others:

  1. Efficient storage of graphs - one segment represents one graph node, the elements in one segments are indexes of its neighbors.
  2. Unstructured numerical meshes - unstructured numerical mesh is a graph in fact.
  3. Particle in cell method - one segment represents one cell, the elements in one segment are indexes of the particles.
  4. K-means clustering - segments represent one cluster, the elements represent vectors belonging to given cluster.
  5. Hashing - segments are particular rows of the hash table, elements in segments correspond with colliding hashed elements.

In general, segments can be used for problems that somehow correspond with a 2D data structure where each row can have a different size and we need to perform miscellaneous operations within the rows. The name segments comes from segmented parallel reduction or segmented scan (prefix-sum).

The following example demonstrates the essence of segments in TNL:

1#include <iostream>
2#include <functional>
3#include <TNL/Containers/Vector.h>
4#include <TNL/Algorithms/Segments/CSR.h>
5#include <TNL/Algorithms/Segments/Ellpack.h>
6#include <TNL/Devices/Host.h>
7#include <TNL/Devices/Cuda.h>
8
9template< typename Segments >
10void SegmentsExample()
11{
12 using DeviceType = typename Segments::DeviceType;
13 using IndexType = typename Segments::IndexType;
14
15 /***
16 * Create segments with given segments sizes.
17 */
18 Segments segments{ 1, 2, 3, 4, 5 };
19 std::cout << "Segments sizes are: " << segments << std::endl;
20
21 /***
22 * Allocate array for the segments;
23 */
24 TNL::Containers::Array< double, DeviceType > data( segments.getStorageSize(), 0.0 );
25
26 /***
27 * Insert data into particular segments.
28 */
29 auto data_view = data.getView();
30 segments.forAllElements( [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx ) mutable {
31 if( localIdx <= segmentIdx )
32 data_view[ globalIdx ] = segmentIdx;
33 } );
34
35 /***
36 * Print the data managed by the segments.
37 */
38 auto fetch = [=] __cuda_callable__ ( IndexType globalIdx ) -> double { return data_view[ globalIdx ]; };
39 printSegments( segments, fetch, std::cout );
40
41 /***
42 * Compute sums of elements in particular segments.
43 */
44 TNL::Containers::Vector< double, DeviceType, IndexType > sums( segments.getSegmentsCount() );
45 auto sums_view = sums.getView();
46 auto sum_fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) -> double {
47 return data_view[ globalIdx ];
48 };
49 auto keep = [=] __cuda_callable__ ( const IndexType& segmentIdx, const double& value ) mutable {
50 sums_view[ segmentIdx ] = value;
51 };
52 segments.reduceAllSegments( sum_fetch, std::plus<>{}, keep, 0.0 );
53 std::cout << "The sums are: " << sums << std::endl;
54}
55
56int main( int argc, char* argv[] )
57{
60
61 std::cout << "Example of CSR segments on host: " << std::endl;
62 SegmentsExample< HostCSR >();
63
64 std::cout << "Example of Ellpack segments on host: " << std::endl;
65 SegmentsExample< HostEllpack >();
66
67#ifdef __CUDACC__
70
71 std::cout << "Example of CSR segments on CUDA GPU: " << std::endl;
72 SegmentsExample< CudaCSR >();
73
74 std::cout << "Example of Ellpack segments on CUDA GPU: " << std::endl;
75 SegmentsExample< CudaEllpack >();
76#endif
77 return EXIT_SUCCESS;
78}
#define __cuda_callable__
Definition: CudaCallable.h:22
Data structure for CSR segments format.
Definition: CSR.h:38
Definition: Ellpack.h:23
Array is responsible for memory management, access to array elements, and general array operations.
Definition: Array.h:67
Vector extends Array with algebraic operations.
Definition: Vector.h:40
T endl(T... args)
std::ostream & printSegments(const Segments &segments, std::ostream &str)
Print segments sizes, i.e. the segments setup.
Definition: SegmentsPrinting.h:31

We demonstrate two formats of segments - TNL::Algorithms::Segments::CSR and TNL::Algorithms::Segments::Ellpack running on both CPU and GPU (lines 58-76). For each of them, we call function SegmentsExample which first creates given segments (line 18). The segments are defined by the sizes of particular segments.

Next we allocate an array with data related to the segments (line 24). The number of elements managed by the segments is given by TNL::Algorithms::Segments::CSR::getStorageSize and TNL::Algorithms::Segments::Ellpack::getStorageSize respectively.

Next we setup the segments elements (lines 29-33) by calling TNL::Algorithms::Segments::CSR::forAllElements (and TNL::Algorithms::Segments::Ellpack::forAllElements respectively) which iterates over all elements of the segments in parallel and performs a given lambda function. The lambda function receives the index of the segment (segmentIdx), the index of the element within the segment (localIdx), the index of the element within the array data and a reference to a boolean (compute) which serves as a hint for interrupting the iteration over the elements of a given segment when it is set to false. The value of the elements having the local index smaller or equal to the segment index is set to the value of the segment index. It creates, in fact, a lower triangular matrix whose elements have values equal to the row index.

Next we use a function TNL::Algorithms::Segments::printSegments to print the content of the segments (lines 38-39). To do this we have to provide a lambda function fetch (line 38) which returns value of elements with given global index.

Finally we show how to compute the sum of all elements in each segment. Firstly, we create a vector into which we will store the sums (line 44) and get its view (line 45). The size of the vector is given by the number of the segments which can be obtained by the means of the method TNL::Algorithms::Segments::CSR::getSegmentsCount (and TNL::Algorithms::Segments::Ellpack::getSegmentsCount respectively). The sums are computed using the method TNL::Algorithms::Segments::CSR::reduceAllSegments (and TNL::Algorithms::Segments::Ellpack::reduceAllSegments respectively) which works the same way as the flexible parallel reduction (TNL::Algorithms::reduce). It requires lambda functions fetch for reading the data related to particular elements of the segments, function reduce which is std::plus in this case and a function keep to store the result of sums in particular segments.

The result looks as follows:

Example of CSR segments on host:
Segments sizes are: [ 1, 2, 3, 4, 5, ]
Seg. 0: [ 0 ]
Seg. 1: [ 1, 1 ]
Seg. 2: [ 2, 2, 2 ]
Seg. 3: [ 3, 3, 3, 3 ]
Seg. 4: [ 4, 4, 4, 4, 4 ]
The sums are: [ 0, 2, 6, 12, 20 ]
Example of Ellpack segments on host:
Segments sizes are: [ 5, 5, 5, 5, 5, ]
Seg. 0: [ 0, 0, 0, 0, 0 ]
Seg. 1: [ 1, 1, 0, 0, 0 ]
Seg. 2: [ 2, 2, 2, 0, 0 ]
Seg. 3: [ 3, 3, 3, 3, 0 ]
Seg. 4: [ 4, 4, 4, 4, 4 ]
The sums are: [ 0, 2, 6, 12, 20 ]
Example of CSR segments on CUDA GPU:
Segments sizes are: [ 1, 2, 3, 4, 5, ]
Seg. 0: [ 0 ]
Seg. 1: [ 1, 1 ]
Seg. 2: [ 2, 2, 2 ]
Seg. 3: [ 3, 3, 3, 3 ]
Seg. 4: [ 4, 4, 4, 4, 4 ]
The sums are: [ 0, 2, 6, 12, 20 ]
Example of Ellpack segments on CUDA GPU:
Segments sizes are: [ 5, 5, 5, 5, 5, ]
Seg. 0: [ 0, 0, 0, 0, 0 ]
Seg. 1: [ 1, 1, 0, 0, 0 ]
Seg. 2: [ 2, 2, 2, 0, 0 ]
Seg. 3: [ 3, 3, 3, 3, 0 ]
Seg. 4: [ 4, 4, 4, 4, 4 ]
The sums are: [ 0, 2, 6, 12, 20 ]

Note that the Ellpack format manages more elements than we asked for. It is because some formats use padding elements for more efficient memory accesses. The padding elements are available to the user as well and so we must ensure that we work only with those elements we want to. This is the reason why we use the if statement on line 31 when setting up the values of the elements in segments. The padding elements can be used in case we later need more elements than we requested. However, the segments data structure does not allow any resizing of the segments. One can change the sizes of the segments, however, the access to the originally managed data becomes invalid at that moment.

Enumeration Type Documentation

◆ ElementsOrganization

Enumerator
ColumnMajorOrder 

Column-major order.

RowMajorOrder 

Row-major order.

Function Documentation

◆ operator<<()

template<typename Device , typename Index , typename Kernel , typename IndexAllocator >
std::ostream & TNL::Algorithms::Segments::operator<< ( std::ostream &  str,
const CSR< Device, Index, Kernel, IndexAllocator > &  segments 
)

Insertion operator of CSR segments to output stream.

Template Parameters
Device is the device type of the source segments.
Index is the index type of the source segments.
Kernel is kernel type of the source segments.
IndexAllocator is the index allocator of the source segments.
Parameters
str is the output stream.
segments are the source segments.
Returns
reference to the output stream.

◆ printSegments()

template<typename Segments >
std::ostream & TNL::Algorithms::Segments::printSegments ( const Segments &  segments,
std::ostream &  str 
)

Print segments sizes, i.e. the segments setup.

Template Parameters
Segments is type of segments.
Parameters
segments is an instance of segments.
str is output stream.
Returns
reference to the output stream.
Example
#include <iostream>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Segments/CSR.h>
#include <TNL/Algorithms/Segments/Ellpack.h>
#include <TNL/Algorithms/Segments/ChunkedEllpack.h>
#include <TNL/Algorithms/Segments/BiEllpack.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
template< typename Segments >
void SegmentsExample()
{
/***
* Create segments with given segments sizes and print their setup.
*/
Segments segments{ 1, 2, 3, 4, 5 };
std::cout << "Segments sizes are: " << segments << std::endl << std::endl;
}
int main( int argc, char* argv[] )
{
std::cout << "Example of CSR segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Host, int > >();
std::cout << "Example of Ellpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Host, int > >();
std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Host, int > >();
std::cout << "Example of BiEllpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Host, int > >();
#ifdef __CUDACC__
std::cout << "Example of CSR segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::CSR< TNL::Devices::Cuda, int > >();
std::cout << "Example of Ellpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::Ellpack< TNL::Devices::Cuda, int > >();
std::cout << "Example of ChunkedEllpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::ChunkedEllpack< TNL::Devices::Cuda, int > >();
std::cout << "Example of BiEllpack segments on host: " << std::endl;
SegmentsExample< TNL::Algorithms::Segments::BiEllpack< TNL::Devices::Cuda, int > >();
#endif
return EXIT_SUCCESS;
}
Output
Example of CSR segments on host:
Segments sizes are: [ 1, 2, 3, 4, 5, ]
Example of Ellpack segments on host:
Segments sizes are: [ 5, 5, 5, 5, 5, ]
Example of ChunkedEllpack segments on host:
Segments sizes are: [ 17, 34, 51, 67, 87, ]
Example of BiEllpack segments on host:
Segments sizes are: [ 2, 4, 4, 5, 5, ]
Example of CSR segments on host:
Segments sizes are: [ 1, 2, 3, 4, 5, ]
Example of Ellpack segments on host:
Segments sizes are: [ 5, 5, 5, 5, 5, ]
Example of ChunkedEllpack segments on host:
Segments sizes are: [ 17, 34, 51, 67, 87, ]
Example of BiEllpack segments on host:
Segments sizes are: [ 2, 4, 4, 5, 5, ]