Template Numerical Library version main:94209208
Loading...
Searching...
No Matches
Namespaces | Classes | Functions
TNL::Algorithms Namespace Reference

Namespace for fundamental TNL algorithms. More...

Namespaces

namespace  Segments
 Namespace for the segments data structures.
 

Classes

struct  AtomicOperations
 
struct  AtomicOperations< Devices::Cuda >
 
struct  AtomicOperations< Devices::Host >
 
struct  AtomicOperations< Devices::Sequential >
 
class  CudaReductionBuffer
 
struct  Multireduction
 
struct  Multireduction< Devices::Cuda >
 
struct  Multireduction< Devices::Host >
 
struct  Multireduction< Devices::Sequential >
 
struct  SegmentedScan
 Computes segmented scan (or prefix sum) on a vector. More...
 
struct  SegmentedScan< Devices::Cuda, Type >
 
struct  SegmentedScan< Devices::Host, Type >
 
struct  SegmentedScan< Devices::Sequential, Type >
 
struct  SequentialFor
 Wrapper to ParallelFor which makes it run sequentially. More...
 

Functions

template<typename Array , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void ascendingSort (Array &array, const Sorter &sorter=Sorter{})
 Function for sorting elements of array or vector in ascending order.
 
template<typename Array >
bool contains (const Array &array, typename Array::ValueType value, typename Array::IndexType begin=0, typename Array::IndexType end=0)
 Checks if an array/vector/view contains an element with given value.
 
template<typename Array >
bool containsOnlyValue (const Array &array, typename Array::ValueType value, typename Array::IndexType begin=0, typename Array::IndexType end=0)
 Checks if all elements of an array/vector/view have the given value.
 
template<typename DestinationDevice , typename SourceDevice = DestinationDevice, typename DestinationElement , typename SourceElement , typename Index >
void copy (DestinationElement *destination, const SourceElement *source, Index size)
 Copies memory from source to destination.
 
template<typename DestinationDevice , typename DestinationElement , typename Index , typename SourceIterator >
void copy (DestinationElement *destination, Index destinationSize, SourceIterator begin, SourceIterator end)
 Copies memory from source iterator range to destination.
 
template<typename Array , typename DestinationElement , typename = std::enable_if_t< IsArrayType< Array >::value >>
void copy (std::vector< DestinationElement > &destination, const Array &source)
 Copies memory from the source TNL array-like container to the destination STL vector.
 
template<typename Array , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void descendingSort (Array &array, const Sorter &sorter=Sorter{})
 Function for sorting elements of array or vector in descending order.
 
template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction >
void distributedExclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin, typename InputDistributedArray::IndexType end, Reduction &&reduction, typename OutputDistributedArray::ValueType identity)
 Computes an exclusive scan (or prefix sum) of a distributed array and stores it in an output array.
 
template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction = TNL::Plus>
void distributedExclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin=0, typename InputDistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of distributedExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction >
void distributedInclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin, typename InputDistributedArray::IndexType end, Reduction &&reduction, typename OutputDistributedArray::ValueType identity)
 Computes an inclusive scan (or prefix sum) of a distributed array and stores it in an output array.
 
template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction = TNL::Plus>
void distributedInclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin=0, typename InputDistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of distributedInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename DistributedArray , typename Reduction >
void distributedInplaceExclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin, typename DistributedArray::IndexType end, Reduction &&reduction, typename DistributedArray::ValueType identity)
 Computes an exclusive scan (or prefix sum) of a distributed array in-place.
 
template<typename DistributedArray , typename Reduction = TNL::Plus>
void distributedInplaceExclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin=0, typename DistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of distributedInplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename DistributedArray , typename Reduction >
void distributedInplaceInclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin, typename DistributedArray::IndexType end, Reduction &&reduction, typename DistributedArray::ValueType identity)
 Computes an inclusive scan (or prefix sum) of a distributed array in-place.
 
template<typename DistributedArray , typename Reduction = TNL::Plus>
void distributedInplaceInclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin=0, typename DistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of distributedInplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename DestinationDevice , typename SourceDevice = DestinationDevice, typename DestinationElement , typename SourceElement , typename Index >
bool equal (DestinationElement *destination, const SourceElement *source, Index size)
 Compares memory from source with destination.
 
template<typename InputArray , typename OutputArray , typename Reduction >
void exclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
 Computes an exclusive scan (or prefix sum) of an input array and stores it in an output array.
 
template<typename InputArray , typename OutputArray , typename Reduction = TNL::Plus>
void exclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin=0, typename InputArray::IndexType end=0, typename OutputArray::IndexType outputBegin=0, Reduction &&reduction=TNL::Plus{})
 Overload of exclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename Device , typename Element , typename Index >
void fill (Element *data, const Element &value, Index size)
 Fills memory between data and data + size with a value.
 
template<typename Device , typename Element , typename Index >
void fillRandom (Element *data, Index size, Element min_val, Element max_val)
 Fills memory between data and data + size with random Element values in the given range.
 
template<typename Container , typename ValueType >
std::pair< bool, typename Container::IndexType > find (const Container &container, const ValueType &value)
 Find the first occurrence of a value in an array.
 
template<typename InputArray , typename OutputArray , typename Reduction >
void inclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
 Computes an inclusive scan (or prefix sum) of an input array and stores it in an output array.
 
template<typename InputArray , typename OutputArray , typename Reduction = TNL::Plus>
void inclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin=0, typename InputArray::IndexType end=0, typename OutputArray::IndexType outputBegin=0, Reduction &&reduction=TNL::Plus{})
 Overload of inclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename Array , typename Reduction >
void inplaceExclusiveScan (Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
 Computes an exclusive scan (or prefix sum) of an array in-place.
 
template<typename Array , typename Reduction = TNL::Plus>
void inplaceExclusiveScan (Array &array, typename Array::IndexType begin=0, typename Array::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of inplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename Array , typename Reduction >
void inplaceInclusiveScan (Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
 Computes an inclusive scan (or prefix sum) of an array in-place.
 
template<typename Array , typename Reduction = TNL::Plus>
void inplaceInclusiveScan (Array &array, typename Array::IndexType begin=0, typename Array::IndexType end=0, Reduction &&reduction=TNL::Plus{})
 Overload of inplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.
 
template<typename Array >
bool isAscending (const Array &arr)
 Function returning true if the array elements are sorted in ascending order.
 
template<typename Array >
bool isDescending (const Array &arr)
 Function returning true if the array elements are sorted in descending order.
 
template<typename Array , typename Compare >
bool isSorted (const Array &arr, const Compare &compare)
 Function returning true if the array elements are sorted according to the lambda function comparison.
 
template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > > parallelFor (const Begin &begin, const End &end, Function f, FunctionArgs... args)
 Parallel for-loop function for 1D range specified with integral values with default launch configuration.
 
template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value > parallelFor (const Begin &begin, const End &end, Function f, FunctionArgs... args)
 Parallel for-loop function for range specified with multi-index values with default launch configuration.
 
template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > > parallelFor (const Begin &begin, const End &end, typename Device::LaunchConfiguration launch_config, Function f, FunctionArgs... args)
 Parallel for-loop function for 1D range specified with integral values.
 
template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value > parallelFor (const Begin &begin, const End &end, typename Device::LaunchConfiguration launch_config, Function f, FunctionArgs... args)
 Parallel for-loop function for range specified with multi-index values.
 
template<typename Array , typename Device = typename Array::DeviceType, typename Reduction , typename Result >
auto reduce (const Array &array, Reduction &&reduction, Result identity)
 Variant of reduce for arrays, views and compatible objects.
 
template<typename Array , typename Device = typename Array::DeviceType, typename Reduction = TNL::Plus>
auto reduce (const Array &array, Reduction &&reduction=TNL::Plus{})
 Variant of reduce for arrays, views and compatible objects.
 
template<typename Device , typename Index , typename Result , typename Fetch , typename Reduction >
Result reduce (Index begin, Index end, Fetch &&fetch, Reduction &&reduction, const Result &identity)
 reduce implements (parallel) reduction for vectors and arrays.
 
template<typename Device , typename Index , typename Fetch , typename Reduction = TNL::Plus>
auto reduce (Index begin, Index end, Fetch &&fetch, Reduction &&reduction=TNL::Plus{})
 Variant of reduce with functional instead of reduction lambda function.
 
template<typename Array , typename Device = typename Array::DeviceType, typename Reduction >
auto reduceWithArgument (const Array &array, Reduction &&reduction)
 Variant of reduceWithArgument for arrays, views and compatible objects.
 
template<typename Array , typename Device = typename Array::DeviceType, typename Reduction , typename Result >
auto reduceWithArgument (const Array &array, Reduction &&reduction, Result identity)
 Variant of reduceWithArgument for arrays, views and compatible objects.
 
template<typename Device , typename Index , typename Fetch , typename Reduction >
auto reduceWithArgument (Index begin, Index end, Fetch &&fetch, Reduction &&reduction)
 Variant of reduceWithArgument with functional instead of reduction lambda function.
 
template<typename Device , typename Index , typename Result , typename Fetch , typename Reduction >
std::pair< Result, Index > reduceWithArgument (Index begin, Index end, Fetch &&fetch, Reduction &&reduction, const Result &identity)
 Variant of reduce returning also the position of the element of interest.
 
template<typename Array , typename Compare , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void sort (Array &array, const Compare &compare, const Sorter &sorter=Sorter{})
 Function for sorting elements of array or vector based on a user defined comparison lambda function.
 
template<typename Device , typename Index , typename Compare , typename Swap , typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType>
void sort (const Index begin, const Index end, Compare &&compare, Swap &&swap, const Sorter &sorter=Sorter{})
 Function for general sorting based on lambda functions for comparison and swapping of two elements.
 
template<typename Index , Index begin, Index end, typename Func , typename... ArgTypes>
constexpr void staticFor (Func &&f, ArgTypes &&... args)
 Generic loop with constant bounds and indices usable in constant expressions.
 
template<typename Index , Index begin, Index end, Index unrollFactor = 8, typename Func >
constexpr void unrolledFor (Func &&f)
 Generic for-loop with explicit unrolling.
 

Detailed Description

Namespace for fundamental TNL algorithms.

It contains algorithms like for-loops, memory operations, (parallel) reduction, multireduction, scan etc.

Function Documentation

◆ ascendingSort()

template<typename Array , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void TNL::Algorithms::ascendingSort ( Array & array,
const Sorter & sorter = Sorter{} )

Function for sorting elements of array or vector in ascending order.

Template Parameters
Arrayis a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Sorteris an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
Parameters
arrayis an instance of array/array view/vector/vector view for sorting.
sorteris an instance of sorter.
Example
1#include <iostream>
2#include <TNL/Containers/Array.h>
3#include <TNL/Algorithms/sort.h>
4
5using namespace TNL;
6using namespace TNL::Containers;
7using namespace TNL::Algorithms;
8
9template< typename ArrayT >
10void
11sort( ArrayT& array )
12{
13 const int size = 10;
14
15 /****
16 * Fill the array with random integers.
17 */
18 Array< int > aux_array( size );
19 srand( size + 2021 );
20 parallelFor< Devices::Host >( 0,
21 size,
22 [ & ]( int i )
23 {
24 aux_array[ i ] = std::rand() % ( 2 * size );
25 } );
26 array = aux_array;
27
28 std::cout << "Random array: " << array << std::endl;
29
30 /****
31 * Sort the array in ascending order.
32 */
33 ascendingSort( array );
34 std::cout << "Array sorted in ascending order:" << array << std::endl;
35
36 /***
37 * Sort the array in descending order.
38 */
39 descendingSort( array );
40 std::cout << "Array sorted in descending order:" << array << std::endl;
41}
42
43int
44main( int argc, char* argv[] )
45{
46 /***
47 * Firstly, test the sorting on CPU.
48 */
49 std::cout << "Sorting on CPU ... " << std::endl;
51 sort( host_array );
52
53#ifdef __CUDACC__
54 /***
55 * And then also on GPU.
56 */
57 std::cout << "Sorting on GPU ... " << std::endl;
59 sort( cuda_array );
60#endif
61 return EXIT_SUCCESS;
62}
Array is responsible for memory management, access to array elements, and general array operations.
Definition Array.h:64
T endl(T... args)
Namespace for fundamental TNL algorithms.
Definition AtomicOperations.h:9
Namespace for TNL containers.
Definition Array.h:17
The main TNL namespace.
Definition AtomicOperations.h:9
T rand(T... args)
Output
Sorting on CPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ contains()

template<typename Array >
bool TNL::Algorithms::contains ( const Array & array,
typename Array::ValueType value,
typename Array::IndexType begin = 0,
typename Array::IndexType end = 0 )

Checks if an array/vector/view contains an element with given value.

By default, all elements of the array are checked. If begin or end is set to a non-zero value, only elements in the sub-interval [begin, end) are checked.

Parameters
arrayThe array to be searched.
valueThe value to be checked.
beginThe beginning of the array sub-interval. It is 0 by default.
endThe end of the array sub-interval. The default value is 0 which is, however, replaced with the array size.
Returns
true if there is at least one element in the sub-interval [begin, end) which has the value value. Returns false if the range is empty.

◆ containsOnlyValue()

template<typename Array >
bool TNL::Algorithms::containsOnlyValue ( const Array & array,
typename Array::ValueType value,
typename Array::IndexType begin = 0,
typename Array::IndexType end = 0 )

Checks if all elements of an array/vector/view have the given value.

By default, all elements of the array are checked. If begin or end is set to a non-zero value, only elements in the sub-interval [begin, end) are checked.

Parameters
arrayThe array to be searched.
valueThe value to be checked.
beginThe beginning of the array sub-interval. It is 0 by default.
endThe end of the array sub-interval. The default value is 0 which is, however, replaced with the array size.
Returns
true if all elements in the sub-interval [begin, end) have the same value value. Returns true if the range is empty.

◆ copy() [1/3]

template<typename DestinationDevice , typename SourceDevice = DestinationDevice, typename DestinationElement , typename SourceElement , typename Index >
void TNL::Algorithms::copy ( DestinationElement * destination,
const SourceElement * source,
Index size )

Copies memory from source to destination.

The source data is allocated on the device specified by SourceDevice and the destination data is allocated on the device specified by DestinationDevice.

Template Parameters
DestinationDeviceis the device where the destination data is allocated.
SourceDeviceis the device where the source data is allocated.
DestinationElementis the type of the destination data.
SourceElementis the type of the source data.
Indexis the type of the size of the data.
Parameters
destinationis the pointer to the destination data.
sourceis the pointer to the source data.
sizeis the size of the data.

◆ copy() [2/3]

template<typename DestinationDevice , typename DestinationElement , typename Index , typename SourceIterator >
void TNL::Algorithms::copy ( DestinationElement * destination,
Index destinationSize,
SourceIterator begin,
SourceIterator end )

Copies memory from source iterator range to destination.

The source data must be allocated on the host device. The destination data is allocated on the device specified by DestinationDevice.

Template Parameters
DestinationDeviceis the device where the destination data is allocated.
DestinationElementis the type of the destination data.
Indexis the type of the size of the data.
SourceIteratoris the iterator type for the source data.
Parameters
destinationis the pointer to the destination data.
destinationSizeis the size of the destination data.
beginis the iterator to the first element of the source data range.
endis the one-past-the-end iterator of the source data range.

◆ copy() [3/3]

template<typename Array , typename DestinationElement , typename = std::enable_if_t< IsArrayType< Array >::value >>
void TNL::Algorithms::copy ( std::vector< DestinationElement > & destination,
const Array & source )

Copies memory from the source TNL array-like container to the destination STL vector.

Template Parameters
Arrayis the type of array where the source data is stored. It can be for example TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector or TNL::Containers::VectorView.
DestinationElementis the type of the destination data stored in the STL vector.
Parameters
destinationis the destination STL vector.
sourceis the source TNL array.

◆ descendingSort()

template<typename Array , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void TNL::Algorithms::descendingSort ( Array & array,
const Sorter & sorter = Sorter{} )

Function for sorting elements of array or vector in descending order.

Template Parameters
Arrayis a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Sorteris an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
Parameters
arrayis an instance of array/array view/vector/vector view for sorting.
sorteris an instance of sorter.
Example
1#include <iostream>
2#include <TNL/Containers/Array.h>
3#include <TNL/Algorithms/sort.h>
4
5using namespace TNL;
6using namespace TNL::Containers;
7using namespace TNL::Algorithms;
8
9template< typename ArrayT >
10void
11sort( ArrayT& array )
12{
13 const int size = 10;
14
15 /****
16 * Fill the array with random integers.
17 */
18 Array< int > aux_array( size );
19 srand( size + 2021 );
20 parallelFor< Devices::Host >( 0,
21 size,
22 [ & ]( int i )
23 {
24 aux_array[ i ] = std::rand() % ( 2 * size );
25 } );
26 array = aux_array;
27
28 std::cout << "Random array: " << array << std::endl;
29
30 /****
31 * Sort the array in ascending order.
32 */
33 ascendingSort( array );
34 std::cout << "Array sorted in ascending order:" << array << std::endl;
35
36 /***
37 * Sort the array in descending order.
38 */
39 descendingSort( array );
40 std::cout << "Array sorted in descending order:" << array << std::endl;
41}
42
43int
44main( int argc, char* argv[] )
45{
46 /***
47 * Firstly, test the sorting on CPU.
48 */
49 std::cout << "Sorting on CPU ... " << std::endl;
51 sort( host_array );
52
53#ifdef __CUDACC__
54 /***
55 * And then also on GPU.
56 */
57 std::cout << "Sorting on GPU ... " << std::endl;
59 sort( cuda_array );
60#endif
61 return EXIT_SUCCESS;
62}
Output
Sorting on CPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ distributedExclusiveScan() [1/2]

template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction >
void TNL::Algorithms::distributedExclusiveScan ( const InputDistributedArray & input,
OutputDistributedArray & output,
typename InputDistributedArray::IndexType begin,
typename InputDistributedArray::IndexType end,
Reduction && reduction,
typename OutputDistributedArray::ValueType identity )

Computes an exclusive scan (or prefix sum) of a distributed array and stores it in an output array.

Exclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(\sigma_1, \ldots, \sigma_n\) defined as

\[ \sigma_i = \sum_{j=1}^{i-1} a_j. \]

Template Parameters
DistributedArraytype of the distributed array to be scanned
Reductiontype of the reduction functor
Parameters
inputinput array
outputoutput array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
#define __cuda_callable__
Definition Macros.h:49

◆ distributedExclusiveScan() [2/2]

template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::distributedExclusiveScan ( const InputDistributedArray & input,
OutputDistributedArray & output,
typename InputDistributedArray::IndexType begin = 0,
typename InputDistributedArray::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of distributedExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputDistributedArray::ValueType >(). See distributedExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ distributedInclusiveScan() [1/2]

template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction >
void TNL::Algorithms::distributedInclusiveScan ( const InputDistributedArray & input,
OutputDistributedArray & output,
typename InputDistributedArray::IndexType begin,
typename InputDistributedArray::IndexType end,
Reduction && reduction,
typename OutputDistributedArray::ValueType identity )

Computes an inclusive scan (or prefix sum) of a distributed array and stores it in an output array.

Inclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(s_1, \ldots, s_n\) defined as

\[ s_i = \sum_{j=1}^i a_j. \]

Template Parameters
DistributedArraytype of the distributed array to be scanned
Reductiontype of the reduction functor
Parameters
inputinput array
outputoutput array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInclusiveScan() [2/2]

template<typename InputDistributedArray , typename OutputDistributedArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::distributedInclusiveScan ( const InputDistributedArray & input,
OutputDistributedArray & output,
typename InputDistributedArray::IndexType begin = 0,
typename InputDistributedArray::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of distributedInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputDistributedArray::ValueType >(). See distributedInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ distributedInplaceExclusiveScan() [1/2]

template<typename DistributedArray , typename Reduction >
void TNL::Algorithms::distributedInplaceExclusiveScan ( DistributedArray & array,
typename DistributedArray::IndexType begin,
typename DistributedArray::IndexType end,
Reduction && reduction,
typename DistributedArray::ValueType identity )

Computes an exclusive scan (or prefix sum) of a distributed array in-place.

Exclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(\sigma_1, \ldots, \sigma_n\) defined as

\[ \sigma_i = \sum_{j=1}^{i-1} a_j. \]

Template Parameters
DistributedArraytype of the distributed array to be scanned
Reductiontype of the reduction functor
Parameters
arrayinput array, the result of scan is stored in the same array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInplaceExclusiveScan() [2/2]

template<typename DistributedArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::distributedInplaceExclusiveScan ( DistributedArray & array,
typename DistributedArray::IndexType begin = 0,
typename DistributedArray::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of distributedInplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename DistributedArray::ValueType >(). See distributedInplaceExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ distributedInplaceInclusiveScan() [1/2]

template<typename DistributedArray , typename Reduction >
void TNL::Algorithms::distributedInplaceInclusiveScan ( DistributedArray & array,
typename DistributedArray::IndexType begin,
typename DistributedArray::IndexType end,
Reduction && reduction,
typename DistributedArray::ValueType identity )

Computes an inclusive scan (or prefix sum) of a distributed array in-place.

Inclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(s_1, \ldots, s_n\) defined as

\[ s_i = \sum_{j=1}^i a_j. \]

Template Parameters
DistributedArraytype of the distributed array to be scanned
Reductiontype of the reduction functor
Parameters
arrayinput array, the result of scan is stored in the same array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInplaceInclusiveScan() [2/2]

template<typename DistributedArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::distributedInplaceInclusiveScan ( DistributedArray & array,
typename DistributedArray::IndexType begin = 0,
typename DistributedArray::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of distributedInplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename DistributedArray::ValueType >(). See distributedInplaceInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ equal()

template<typename DestinationDevice , typename SourceDevice = DestinationDevice, typename DestinationElement , typename SourceElement , typename Index >
bool TNL::Algorithms::equal ( DestinationElement * destination,
const SourceElement * source,
Index size )

Compares memory from source with destination.

The source data is allocated on the device specified by SourceDevice and the destination data is allocated on the device specified by DestinationDevice.

Template Parameters
DestinationDeviceis the device where the destination data is allocated.
SourceDeviceis the device where the source data is allocated.
DestinationElementis the type of the destination data.
SourceElementis the type of the source data.
Indexis the type of the size of the data.
Parameters
destinationis the pointer to the destination data.
sourceis the pointer to the source data.
sizeis the size of the data.
Returns
true if all elements are equal, false otherwise.

◆ exclusiveScan() [1/2]

template<typename InputArray , typename OutputArray , typename Reduction >
void TNL::Algorithms::exclusiveScan ( const InputArray & input,
OutputArray & output,
typename InputArray::IndexType begin,
typename InputArray::IndexType end,
typename OutputArray::IndexType outputBegin,
Reduction && reduction,
typename OutputArray::ValueType identity )

Computes an exclusive scan (or prefix sum) of an input array and stores it in an output array.

Exclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(\sigma_1, \ldots, \sigma_n\) defined as

\[ \sigma_i = \sum_{j=1}^{i-1} a_j. \]

Template Parameters
InputArraytype of the array to be scanned
OutputArraytype of the output array
Reductiontype of the reduction functor
Parameters
inputthe input array to be scanned
outputthe array where the result will be stored
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
outputBeginthe first element in the output array to be written. There must be at least end - begin elements in the output array starting at the position given by outputBegin.
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
Example
#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the prefix sum with an array allocated on CPU.
*/
Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
host_input = 1.0;
std::cout << "host_input = " << host_input << std::endl;
exclusiveScan( host_input, host_output );
std::cout << "host_output " << host_output << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
cuda_input = 1.0;
std::cout << "cuda_input = " << cuda_input << std::endl;
exclusiveScan( cuda_input, cuda_output );
std::cout << "cuda_output " << cuda_output << std::endl;
#endif
return EXIT_SUCCESS;
}
void exclusiveScan(const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
Computes an exclusive scan (or prefix sum) of an input array and stores it in an output array.
Definition scan.h:143
Output
host_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
host_output [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
cuda_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
cuda_output [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]

◆ exclusiveScan() [2/2]

template<typename InputArray , typename OutputArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::exclusiveScan ( const InputArray & input,
OutputArray & output,
typename InputArray::IndexType begin = 0,
typename InputArray::IndexType end = 0,
typename OutputArray::IndexType outputBegin = 0,
Reduction && reduction = TNL::Plus{} )

Overload of exclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputArray::ValueType >(). See exclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ fill()

template<typename Device , typename Element , typename Index >
void TNL::Algorithms::fill ( Element * data,
const Element & value,
Index size )

Fills memory between data and data + size with a value.

Template Parameters
Deviceis the device where the data is allocated.
Elementis the type of the data.
Indexis the type of the size of the data.
Parameters
datais the pointer to the memory where the value will be set.
valueis the value to be filled.
sizeis the size of the data.

◆ fillRandom()

template<typename Device , typename Element , typename Index >
void TNL::Algorithms::fillRandom ( Element * data,
Index size,
Element min_val,
Element max_val )

Fills memory between data and data + size with random Element values in the given range.

Template Parameters
Deviceis the device where the data is allocated.
Elementis the type of the data.
Indexis the type of the size of the data.
Parameters
datais the pointer to the memory where the random values will be set.
sizeis the size of the data.
min_valis the minimum random value
max_valis the maximum random value

◆ find()

template<typename Container , typename ValueType >
std::pair< bool, typename Container::IndexType > TNL::Algorithms::find ( const Container & container,
const ValueType & value )

Find the first occurrence of a value in an array.

Template Parameters
Containeris the type of the container.
ValueTypeis the type of the value to be found.
IndexTypeis the type used for indexing.
Parameters
containeris the array where the value is searched.
valueis the value to be found.
Returns
pair (found, position) where found is a boolean indicating if the value was found and position is the position of the first occurrence in the container.

◆ inclusiveScan() [1/2]

template<typename InputArray , typename OutputArray , typename Reduction >
void TNL::Algorithms::inclusiveScan ( const InputArray & input,
OutputArray & output,
typename InputArray::IndexType begin,
typename InputArray::IndexType end,
typename OutputArray::IndexType outputBegin,
Reduction && reduction,
typename OutputArray::ValueType identity )

Computes an inclusive scan (or prefix sum) of an input array and stores it in an output array.

Inclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(s_1, \ldots, s_n\) defined as

\[ s_i = \sum_{j=1}^i a_j. \]

Template Parameters
InputArraytype of the array to be scanned
OutputArraytype of the output array
Reductiontype of the reduction functor
Parameters
inputthe input array to be scanned
outputthe array where the result will be stored
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
outputBeginthe first element in the output array to be written. There must be at least end - begin elements in the output array starting at the position given by outputBegin.
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
Example
#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the prefix sum with an array allocated on CPU.
*/
Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
host_input = 1.0;
std::cout << "host_input = " << host_input << std::endl;
inclusiveScan( host_input, host_output );
std::cout << "host_output " << host_output << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
cuda_input = 1.0;
std::cout << "cuda_input = " << cuda_input << std::endl;
inclusiveScan( cuda_input, cuda_output );
std::cout << "cuda_output " << cuda_output << std::endl;
#endif
return EXIT_SUCCESS;
}
void inclusiveScan(const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
Computes an inclusive scan (or prefix sum) of an input array and stores it in an output array.
Definition scan.h:57
Output
host_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
host_output [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]
cuda_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
cuda_output [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]

◆ inclusiveScan() [2/2]

template<typename InputArray , typename OutputArray , typename Reduction = TNL::Plus>
void TNL::Algorithms::inclusiveScan ( const InputArray & input,
OutputArray & output,
typename InputArray::IndexType begin = 0,
typename InputArray::IndexType end = 0,
typename OutputArray::IndexType outputBegin = 0,
Reduction && reduction = TNL::Plus{} )

Overload of inclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputArray::ValueType >(). See inclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ inplaceExclusiveScan() [1/2]

template<typename Array , typename Reduction >
void TNL::Algorithms::inplaceExclusiveScan ( Array & array,
typename Array::IndexType begin,
typename Array::IndexType end,
Reduction && reduction,
typename Array::ValueType identity )

Computes an exclusive scan (or prefix sum) of an array in-place.

Exclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(\sigma_1, \ldots, \sigma_n\) defined as

\[ \sigma_i = \sum_{j=1}^{i-1} a_j. \]

Template Parameters
Arraytype of the array to be scanned
Reductiontype of the reduction functor
Parameters
arrayinput array, the result of scan is stored in the same array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
Example
#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the prefix sum with an array allocated on CPU.
*/
host_a = 1.0;
std::cout << "host_a = " << host_a << std::endl;
std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
cuda_a = 1.0;
std::cout << "cuda_a = " << cuda_a << std::endl;
std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
void inplaceExclusiveScan(Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
Computes an exclusive scan (or prefix sum) of an array in-place.
Definition scan.h:295
Output
host_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The prefix sum of the host array is [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ].
cuda_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The prefix sum of the CUDA array is [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ].

◆ inplaceExclusiveScan() [2/2]

template<typename Array , typename Reduction = TNL::Plus>
void TNL::Algorithms::inplaceExclusiveScan ( Array & array,
typename Array::IndexType begin = 0,
typename Array::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of inplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename Array::ValueType >(). See inplaceExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ inplaceInclusiveScan() [1/2]

template<typename Array , typename Reduction >
void TNL::Algorithms::inplaceInclusiveScan ( Array & array,
typename Array::IndexType begin,
typename Array::IndexType end,
Reduction && reduction,
typename Array::ValueType identity )

Computes an inclusive scan (or prefix sum) of an array in-place.

Inclusive scan (or prefix sum) operation turns a sequence \(a_1, \ldots, a_n\) into a sequence \(s_1, \ldots, s_n\) defined as

\[ s_i = \sum_{j=1}^i a_j. \]

Template Parameters
Arraytype of the array to be scanned
Reductiontype of the reduction functor
Parameters
arrayinput array, the result of scan is stored in the same array
beginthe first element in the array to be scanned
endthe last element in the array to be scanned
reductionfunctor implementing the reduction operation
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
Example
#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the prefix sum with an array allocated on CPU.
*/
host_a = 1.0;
std::cout << "host_a = " << host_a << std::endl;
std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
cuda_a = 1.0;
std::cout << "cuda_a = " << cuda_a << std::endl;
std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
void inplaceInclusiveScan(Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
Computes an inclusive scan (or prefix sum) of an array in-place.
Definition scan.h:223
Output
host_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The prefix sum of the host array is [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ].
cuda_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The prefix sum of the CUDA array is [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ].

◆ inplaceInclusiveScan() [2/2]

template<typename Array , typename Reduction = TNL::Plus>
void TNL::Algorithms::inplaceInclusiveScan ( Array & array,
typename Array::IndexType begin = 0,
typename Array::IndexType end = 0,
Reduction && reduction = TNL::Plus{} )

Overload of inplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename Array::ValueType >(). See inplaceInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ isAscending()

template<typename Array >
bool TNL::Algorithms::isAscending ( const Array & arr)

Functions returning true if the array elements are sorted in ascending order.

Template Parameters
Arrayis the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Parameters
arris an instance of tested array.
Returns
true if the array is sorted in ascending order.
false if the array is NOT sorted in ascending order.

◆ isDescending()

template<typename Array >
bool TNL::Algorithms::isDescending ( const Array & arr)

Functions returning true if the array elements are sorted in descending order.

Template Parameters
Arrayis the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Parameters
arris an instance of tested array.
Returns
true if the array is sorted in descending order.
false if the array is NOT sorted in descending order.

◆ isSorted()

template<typename Array , typename Compare >
bool TNL::Algorithms::isSorted ( const Array & arr,
const Compare & compare )

Function returning true if the array elements are sorted according to the lambda function comparison.

Template Parameters
Arrayis the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Compareis a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given by indices representing their positions. The lambda function is supposed to be defined as follows:
auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };
Parameters
arris an instance of tested array.
compareis an instance of the lambda function for elements comparison.
Returns
true if the array is sorted in ascending order.
false if the array is NOT sorted in ascending order.

◆ parallelFor() [1/2]

template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > > TNL::Algorithms::parallelFor ( const Begin & begin,
const End & end,
typename Device::LaunchConfiguration launch_config,
Function f,
FunctionArgs... args )

Parallel for-loop function for 1D range specified with integral values.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Beginmust be an integral type.
Endmust be an integral type.
Parameters
beginis the left bound of the iteration range [begin, end).
endis the right bound of the iteration range [begin, end).
fis the function to be called in each iteration. Arguments of the function are the iteration index and arguments from the args... variadic pack.
launch_configspecifies kernel launch parameters.
argsare additional parameters to be passed to the function f.
Example
#include <iostream>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/parallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
/****
* Set all elements of the vector v to the constant c.
*/
template< typename Device >
void
initVector( Vector< double, Device >& v, const double& c )
{
auto view = v.getView();
auto init = [ = ] __cuda_callable__( int i ) mutable
{
view[ i ] = c;
};
parallelFor< Device >( 0, v.getSize(), init );
}
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the vector initiation on CPU.
*/
initVector( host_v, 1.0 );
std::cout << "host_v = " << host_v << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
initVector( cuda_v, 1.0 );
std::cout << "cuda_v = " << cuda_v << std::endl;
#endif
return EXIT_SUCCESS;
}
__cuda_callable__ IndexType getSize() const
Returns the current array size.
Definition Array.hpp:245
Vector extends Array with algebraic operations.
Definition Vector.h:36
ViewType getView(IndexType begin=0, IndexType end=0)
Returns a modifiable view of the vector.
Definition Vector.hpp:25
Output
host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

◆ parallelFor() [2/2]

template<typename Device , typename Begin , typename End , typename Function , typename... FunctionArgs>
std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value > TNL::Algorithms::parallelFor ( const Begin & begin,
const End & end,
typename Device::LaunchConfiguration launch_config,
Function f,
FunctionArgs... args )

Parallel for-loop function for range specified with multi-index values.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Beginmust satisfy the constraints checked by the TNL::IsStaticArrayType type trait.
Endmust satisfy the constraints checked by the TNL::IsStaticArrayType type trait.
Parameters
beginis the left bound of the iteration range [begin, end).
endis the right bound of the iteration range [begin, end).
fis the function to be called in each iteration. Arguments of the function are the iteration multi-index, which is an instance of the End type, and arguments from the args... variadic pack.
launch_configspecifies kernel launch parameters.
argsare additional parameters to be passed to the function f.
Example
#include <iostream>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/StaticArray.h>
#include <TNL/Algorithms/parallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
void
initMeshFunction( const int xSize, const int ySize, const int zSize, Vector< double, Device >& v, const double& c )
{
auto view = v.getView();
auto init = [ = ] __cuda_callable__( const StaticArray< 3, int >& i ) mutable
{
view[ ( i.z() * ySize + i.y() ) * xSize + i.x() ] = c;
};
StaticArray< 3, int > begin{ 0, 0, 0 };
StaticArray< 3, int > end{ xSize, ySize, zSize };
parallelFor< Device >( begin, end, init );
}
int
main( int argc, char* argv[] )
{
/***
* Define dimensions of a 3D mesh function.
*/
const int xSize( 10 ), ySize( 10 ), zSize( 10 );
const int size = xSize * ySize * zSize;
/***
* Firstly, test the mesh function initiation on CPU.
*/
initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );
#endif
return EXIT_SUCCESS;
}
Array with constant size.
Definition StaticArray.h:20
Output

◆ reduce() [1/4]

template<typename Array , typename Device = typename Array::DeviceType, typename Reduction , typename Result >
auto TNL::Algorithms::reduce ( const Array & array,
Reduction && reduction,
Result identity )

Variant of reduce for arrays, views and compatible objects.

The referenced reduce function is called with:

  • Device, which is typename Array::DeviceType by default, as the Device type,
  • 0 as the beginning of the interval for reduction,
  • array.getSize() as the end of the interval for reduction,
  • array.getConstView() as the fetch functor,
  • reduction as the reduction operation,
  • and identity as the identity element of the reduction.
Example
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
template< typename Device >
void
reduceArrayExample()
{
/****
* Create new arrays
*/
const int size = 10;
/****
* Initiate the elements of array `a`
*/
a.forAllElements(
[] __cuda_callable__( int i, float& value )
{
value = i;
} );
/****
* Sum all elements of array `a`
*/
float sum_total = Algorithms::reduce( a, TNL::Plus{} );
/****
* Sum last 5 elements of array `a`
*/
float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
/****
* Print the results
*/
std::cout << " a = " << a << std::endl;
std::cout << " sum of all elements = " << sum_total << std::endl;
std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
}
int
main( int argc, char* argv[] )
{
std::cout << "Running example on the host system: " << std::endl;
reduceArrayExample< Devices::Host >();
#ifdef __CUDACC__
std::cout << "Running example on the CUDA device: " << std::endl;
reduceArrayExample< Devices::Cuda >();
#endif
}
Function object implementing x + y.
Definition Functional.h:17
Output
Running example on the host system:
a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
sum of all elements = 45
sum of last 5 elements = 35
Running example on the CUDA device:
a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
sum of all elements = 45
sum of last 5 elements = 35

◆ reduce() [2/4]

template<typename Array , typename Device = typename Array::DeviceType, typename Reduction = TNL::Plus>
auto TNL::Algorithms::reduce ( const Array & array,
Reduction && reduction = TNL::Plus{} )

Variant of reduce for arrays, views and compatible objects.

Reduction can be one of the following TNL::Plus, TNL::Multiplies, TNL::Min, TNL::Max, TNL::LogicalAnd, TNL::LogicalOr, TNL::BitAnd or TNL::BitOr. TNL::Plus is used by default.

The referenced reduce function is called with:

  • Device, which is typename Array::DeviceType by default, as the Device type,
  • 0 as the beginning of the interval for reduction,
  • array.getSize() as the end of the interval for reduction,
  • array.getConstView() as the fetch functor,
  • reduction as the reduction operation,
  • and the identity element obtained from the reduction functional object.
Example
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
template< typename Device >
void
reduceArrayExample()
{
/****
* Create new arrays
*/
const int size = 10;
/****
* Initiate the elements of array `a`
*/
a.forAllElements(
[] __cuda_callable__( int i, float& value )
{
value = i;
} );
/****
* Sum all elements of array `a`
*/
float sum_total = Algorithms::reduce( a, TNL::Plus{} );
/****
* Sum last 5 elements of array `a`
*/
float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
/****
* Print the results
*/
std::cout << " a = " << a << std::endl;
std::cout << " sum of all elements = " << sum_total << std::endl;
std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
}
int
main( int argc, char* argv[] )
{
std::cout << "Running example on the host system: " << std::endl;
reduceArrayExample< Devices::Host >();
#ifdef __CUDACC__
std::cout << "Running example on the CUDA device: " << std::endl;
reduceArrayExample< Devices::Cuda >();
#endif
}
Output
Running example on the host system:
a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
sum of all elements = 45
sum of last 5 elements = 35
Running example on the CUDA device:
a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
sum of all elements = 45
sum of last 5 elements = 35

◆ reduce() [3/4]

template<typename Device , typename Index , typename Result , typename Fetch , typename Reduction >
Result TNL::Algorithms::reduce ( Index begin,
Index end,
Fetch && fetch,
Reduction && reduction,
const Result & identity )

reduce implements (parallel) reduction for vectors and arrays.

Reduction can be used for operations having one or more vectors (or arrays) elements as input and returning one number (or element) as output. Some examples of such operations can be vectors/arrays comparison, vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even the position of the smallest or the largest element, the function reduceWithArgument can be used.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Indexis a type for indexing.
Resultis a type of the reduction result.
Fetchis a lambda function for fetching the input data.
Reductionis a lambda function performing the reduction.

Device can be one of the following: TNL::Devices::Sequential, TNL::Devices::Host and TNL::Devices::Cuda.

Parameters
begindefines range [begin, end) of indexes which will be used for the reduction.
enddefines range [begin, end) of indexes which will be used for the reduction.
fetchis a lambda function fetching the input data.
reductionis a lambda function defining the reduction operation.
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.
Returns
result of the reduction

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

The reduction lambda function takes two variables which are supposed to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
Example
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
double
sum( const Vector< double, Device >& v )
{
/****
* Get vector view which can be captured by lambda.
*/
auto view = v.getConstView();
/****
* The fetch function just reads elements of vector v.
*/
auto fetch = [ = ] __cuda_callable__( int i ) -> double
{
return view[ i ];
};
/***
* Reduction is sum of two numbers.
*/
auto reduction = [] __cuda_callable__( const double& a, const double& b )
{
return a + b;
};
/***
* Finally we call the templated function Reduction and pass number of elements to reduce,
* lambdas defined above and finally value of identity element, zero in this case, which serve for the
* reduction initiation.
*/
return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
}
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the sum with vectors allocated on CPU.
*/
host_v = 1.0;
std::cout << "host_v = " << host_v << std::endl;
std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
cuda_v = 1.0;
std::cout << "cuda_v = " << cuda_v << std::endl;
std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
ConstViewType getConstView(IndexType begin=0, IndexType end=0) const
Returns a non-modifiable view of the vector.
Definition Vector.hpp:40
Output
host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The sum of the host vector elements is 10.
cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The sum of the CUDA vector elements is 10.

◆ reduce() [4/4]

template<typename Device , typename Index , typename Fetch , typename Reduction = TNL::Plus>
auto TNL::Algorithms::reduce ( Index begin,
Index end,
Fetch && fetch,
Reduction && reduction = TNL::Plus{} )

Variant of reduce with functional instead of reduction lambda function.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Indexis a type for indexing.
Fetchis a lambda function for fetching the input data.
Reductionis a functional performing the reduction.

Device can be one of the following: TNL::Devices::Sequential, TNL::Devices::Host and TNL::Devices::Cuda.

Reduction can be one of the following TNL::Plus, TNL::Multiplies, TNL::Min, TNL::Max, TNL::LogicalAnd, TNL::LogicalOr, TNL::BitAnd or TNL::BitOr. TNL::Plus is used by default.

Parameters
begindefines range [begin, end) of indexes which will be used for the reduction.
enddefines range [begin, end) of indexes which will be used for the reduction.
fetchis a lambda function fetching the input data.
reductionis a functional object defining the reduction operation.
Returns
result of the reduction

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
Example
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
double
sum( const Vector< double, Device >& v )
{
/****
* Get vector view which can be captured by lambda.
*/
auto view = v.getConstView();
/****
* The fetch function just reads elements of vector v.
*/
auto fetch = [ = ] __cuda_callable__( int i ) -> double
{
return view[ i ];
};
/***
* Finally we call the templated function Reduction and pass number of elements to reduce,
* lambda defined above and functional representing the reduction operation.
*/
return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus{} );
}
int
main( int argc, char* argv[] )
{
/***
* Firstly, test the sum with vectors allocated on CPU.
*/
host_v = 1.0;
std::cout << "host_v = " << host_v << std::endl;
std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
/***
* And then also on GPU.
*/
#ifdef __CUDACC__
cuda_v = 1.0;
std::cout << "cuda_v = " << cuda_v << std::endl;
std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
Output
host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The sum of the host vector elements is 10.
cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]
The sum of the CUDA vector elements is 10.

◆ reduceWithArgument() [1/4]

template<typename Array , typename Device = typename Array::DeviceType, typename Reduction >
auto TNL::Algorithms::reduceWithArgument ( const Array & array,
Reduction && reduction )

Variant of reduceWithArgument for arrays, views and compatible objects.

Reduction can be one of TNL::MinWithArg, TNL::MaxWithArg.

The referenced reduceWithArgument function is called with:

  • Device, which is typename Array::DeviceType by default, as the Device type,
  • 0 as the beginning of the interval for reduction,
  • array.getSize() as the end of the interval for reduction,
  • array.getConstView() as the fetch functor,
  • reduction as the reduction operation,
  • and the identity element obtained from the reduction functional object.
Example
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
template< typename Device >
void
reduceArrayExample()
{
/****
* Create new arrays
*/
const int size = 10;
/****
* Initiate the elements of array `a`
*/
a.forAllElements(
[] __cuda_callable__( int i, float& value )
{
value = 3 - i;
} );
/****
* Reduce all elements of array `a`
*/
std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
/****
* Print the results
*/
std::cout << " a = " << a << std::endl;
std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
}
int
main( int argc, char* argv[] )
{
std::cout << "Running example on the host system: " << std::endl;
reduceArrayExample< Devices::Host >();
#ifdef __CUDACC__
std::cout << "Running example on the CUDA device: " << std::endl;
reduceArrayExample< Devices::Cuda >();
#endif
}
Function object implementing argmax(x, y, i, j) for use with TNL::Algorithms::reduceWithArgument.
Definition Functional.h:309
Output
Running example on the host system:
a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
abs-max of all elements = 6 at position 9
Running example on the CUDA device:
a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
abs-max of all elements = 6 at position 9

◆ reduceWithArgument() [2/4]

template<typename Array , typename Device = typename Array::DeviceType, typename Reduction , typename Result >
auto TNL::Algorithms::reduceWithArgument ( const Array & array,
Reduction && reduction,
Result identity )

Variant of reduceWithArgument for arrays, views and compatible objects.

The referenced reduceWithArgument function is called with:

  • Device, which is typename Array::DeviceType by default, as the Device type,
  • 0 as the beginning of the interval for reduction,
  • array.getSize() as the end of the interval for reduction,
  • array.getConstView() as the fetch functor,
  • reduction as the reduction operation,
  • and identity as the identity element of the reduction.
Example
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
template< typename Device >
void
reduceArrayExample()
{
/****
* Create new arrays
*/
const int size = 10;
/****
* Initiate the elements of array `a`
*/
a.forAllElements(
[] __cuda_callable__( int i, float& value )
{
value = 3 - i;
} );
/****
* Reduce all elements of array `a`
*/
std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
/****
* Print the results
*/
std::cout << " a = " << a << std::endl;
std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
}
int
main( int argc, char* argv[] )
{
std::cout << "Running example on the host system: " << std::endl;
reduceArrayExample< Devices::Host >();
#ifdef __CUDACC__
std::cout << "Running example on the CUDA device: " << std::endl;
reduceArrayExample< Devices::Cuda >();
#endif
}
Output
Running example on the host system:
a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
abs-max of all elements = 6 at position 9
Running example on the CUDA device:
a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
abs-max of all elements = 6 at position 9

◆ reduceWithArgument() [3/4]

template<typename Device , typename Index , typename Fetch , typename Reduction >
auto TNL::Algorithms::reduceWithArgument ( Index begin,
Index end,
Fetch && fetch,
Reduction && reduction )

Variant of reduceWithArgument with a functional object instead of a reduction lambda function.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Indexis a type for indexing.
Resultis a type of the reduction result.
Reductionis a functional performing the reduction.
Fetchis a lambda function for fetching the input data.

Device can be one of the following: TNL::Devices::Sequential, TNL::Devices::Host and TNL::Devices::Cuda.

Reduction can be one of TNL::MinWithArg, TNL::MaxWithArg.

Parameters
begindefines range [begin, end) of indexes which will be used for the reduction.
enddefines range [begin, end) of indexes which will be used for the reduction.
fetchis a lambda function fetching the input data.
reductionis a lambda function defining the reduction operation and managing the elements positions.
Returns
result of the reduction in the form of a std::pair< Result, Index > structure. pair.first is the reduction result and pair.second is the position of the element.

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

The reduction lambda function takes two variables which are supposed to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
Example
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
maximumNorm( const Vector< double, Device >& v )
{
auto view = v.getConstView();
auto fetch = [ = ] __cuda_callable__( int i )
{
return abs( view[ i ] );
};
return reduceWithArgument< Device >( 0, view.getSize(), fetch, TNL::MaxWithArg{} );
}
int
main( int argc, char* argv[] )
{
host_v.forAllElements(
[] __cuda_callable__( int i, double& value )
{
value = i - 7;
} );
std::cout << "host_v = " << host_v << std::endl;
auto maxNormHost = maximumNorm( host_v );
std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second
<< "." << std::endl;
#ifdef __CUDACC__
cuda_v.forAllElements(
[] __cuda_callable__( int i, double& value )
{
value = i - 7;
} );
std::cout << "cuda_v = " << cuda_v << std::endl;
auto maxNormCuda = maximumNorm( cuda_v );
std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position "
<< maxNormCuda.second << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
Output
host_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]
The maximum norm of the host vector elements is 7 at position 0.
cuda_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]
The maximum norm of the device vector elements is 7 at position 0.

◆ reduceWithArgument() [4/4]

template<typename Device , typename Index , typename Result , typename Fetch , typename Reduction >
std::pair< Result, Index > TNL::Algorithms::reduceWithArgument ( Index begin,
Index end,
Fetch && fetch,
Reduction && reduction,
const Result & identity )

Variant of reduce returning also the position of the element of interest.

For example, in case of computing minimal or maximal element in array/vector, the position of the element having given value can be obtained. This method is, however, more flexible.

Template Parameters
Deviceis a type of the device where the reduction will be performed.
Indexis a type for indexing.
Resultis a type of the reduction result.
Reductionis a lambda function performing the reduction.
Fetchis a lambda function for fetching the input data.

Device can be one of the following: TNL::Devices::Sequential, TNL::Devices::Host and TNL::Devices::Cuda.

Parameters
begindefines range [begin, end) of indexes which will be used for the reduction.
enddefines range [begin, end) of indexes which will be used for the reduction.
fetchis a lambda function fetching the input data.
reductionis a lambda function defining the reduction operation and managing the elements positions.
identityis the identity element for the reduction operation, i.e. element which does not change the result of the reduction.
Returns
result of the reduction in the form of a std::pair< Result, Index > structure. pair.first is the reduction result and pair.second is the position of the element.

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

The reduction lambda function takes two variables which are supposed to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
Example
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
maximumNorm( const Vector< double, Device >& v )
{
auto view = v.getConstView();
auto fetch = [ = ] __cuda_callable__( int i )
{
return abs( view[ i ] );
};
auto reduction = [] __cuda_callable__( double& a, const double& b, int& aIdx, const int& bIdx )
{
if( a < b ) {
a = b;
aIdx = bIdx;
}
else if( a == b && bIdx < aIdx )
aIdx = bIdx;
};
return reduceWithArgument< Device >( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::lowest() );
}
int
main( int argc, char* argv[] )
{
host_v.forAllElements(
[] __cuda_callable__( int i, double& value )
{
value = i - 7;
} );
std::cout << "host_v = " << host_v << std::endl;
auto maxNormHost = maximumNorm( host_v );
std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second
<< "." << std::endl;
#ifdef __CUDACC__
cuda_v.forAllElements(
[] __cuda_callable__( int i, double& value )
{
value = i - 7;
} );
std::cout << "cuda_v = " << cuda_v << std::endl;
auto maxNormCuda = maximumNorm( cuda_v );
std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position "
<< maxNormCuda.second << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
Output
host_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]
The maximum norm of the host vector elements is 7 at position 0.
cuda_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]
The maximum norm of the device vector elements is 7 at position 0.

◆ sort() [1/2]

template<typename Array , typename Compare , typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void TNL::Algorithms::sort ( Array & array,
const Compare & compare,
const Sorter & sorter = Sorter{} )

Function for sorting elements of array or vector based on a user defined comparison lambda function.

Template Parameters
Arrayis a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Compareis a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The lambda function is supposed to be defined as follows (ValueType is type of the array elements):
auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... };
Sorteris an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
Parameters
arrayis an instance of array/array view/vector/vector view for sorting.
compareis an instance of the lambda function for comparison of two elements.
sorteris an instance of sorter.
Example
1#include <iostream>
2#include <TNL/Containers/Array.h>
3#include <TNL/Algorithms/sort.h>
4
5using namespace TNL;
6using namespace TNL::Containers;
7using namespace TNL::Algorithms;
8
9template< typename ArrayT >
10void
11sort( ArrayT& array )
12{
13 const int size = 10;
14
15 /****
16 * Fill the array with random integers.
17 */
18 Array< int > aux_array( size );
19 srand( size + 2021 );
20 parallelFor< Devices::Host >( 0,
21 size,
22 [ & ]( int i )
23 {
24 aux_array[ i ] = std::rand() % ( 2 * size );
25 } );
26 array = aux_array;
27
28 std::cout << "Random array: " << array << std::endl;
29
30 /****
31 * Sort the array in ascending order.
32 */
33 sort( array,
34 [] __cuda_callable__( int a, int b )
35 {
36 return a < b;
37 } );
38 std::cout << "Array sorted in ascending order:" << array << std::endl;
39
40 /***
41 * Sort the array in descending order.
42 */
43 sort( array,
44 [] __cuda_callable__( int a, int b )
45 {
46 return a > b;
47 } );
48 std::cout << "Array sorted in descending order:" << array << std::endl;
49}
50
51int
52main( int argc, char* argv[] )
53{
54 /***
55 * Firstly, test the sorting on CPU.
56 */
57 std::cout << "Sorting on CPU ... " << std::endl;
59 sort( host_array );
60
61#ifdef __CUDACC__
62 /***
63 * And then also on GPU.
64 */
65 std::cout << "Sorting on GPU ... " << std::endl;
67 sort( cuda_array );
68#endif
69 return EXIT_SUCCESS;
70}
T sort(T... args)
Output
Sorting on CPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ sort() [2/2]

template<typename Device , typename Index , typename Compare , typename Swap , typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType>
void TNL::Algorithms::sort ( const Index begin,
const Index end,
Compare && compare,
Swap && swap,
const Sorter & sorter = Sorter{} )

Function for general sorting based on lambda functions for comparison and swapping of two elements.

Template Parameters
Deviceis device on which the sorting algorithms should be executed.
Indexis type used for indexing of the sorted data.
Compareis a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given by indices representing their positions. The lambda function is supposed to be defined as follows:
auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };
Swapis a lambda function for swapping of two elements which are ordered the wrong way. Both elements are represented by indices as well. It is supposed to be defined as:
auto swap = [=] __cuda_callable__ ( const Index& a , const Index& b ) mutable { swap( ....); };
__cuda_callable__ constexpr void swap(Type &a, Type &b)
This function swaps values of two parameters.
Definition Math.h:496
Sorteris an algorithm for sorting. It can be TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. Currently there is no algorithm for CPU :(.
Parameters
beginis the first index of the range [begin, end) to be sorted.
endis the end index of the range [begin, end) to be sorted.
compareis an instance of the lambda function for comparison of two elements.
swapis an instance of the lambda function for swapping of two elements.
sorteris an instance of sorter.
Example
1#include <iostream>
2#include <TNL/Containers/Array.h>
3#include <TNL/Algorithms/sort.h>
4
5using namespace TNL;
6using namespace TNL::Containers;
7using namespace TNL::Algorithms;
8
9template< typename ArrayT >
10void
11sort( ArrayT& array )
12{
13 const int size = 10;
14
15 /****
16 * Fill the array with random integers.
17 */
18 Array< int > aux_array( size );
19 srand( size + 2021 );
20 parallelFor< Devices::Host >( 0,
21 size,
22 [ & ]( int i )
23 {
24 aux_array[ i ] = std::rand() % ( 2 * size );
25 } );
26 array = aux_array;
27
28 /***
29 * Prepare second array holding elements positions.
30 */
31 ArrayT index( size );
32 index.forAllElements(
33 [] __cuda_callable__( int idx, int& value )
34 {
35 value = idx;
36 } );
37 std::cout << "Random array: " << array << std::endl;
38 std::cout << "Index array: " << index << std::endl;
39
40 /***
41 * Sort the array `array` and apply the same permutation on the array `identity`.
42 */
43 auto array_view = array.getView();
44 auto index_view = index.getView();
45 sort< typename ArrayT::DeviceType, // device on which the sorting will be performed
46 typename ArrayT::IndexType >( // type used for indexing
47 0,
48 size, // range of indexes
49 [ = ] __cuda_callable__( int i, int j ) -> bool { // comparison lambda function
50 return array_view[ i ] < array_view[ j ];
51 },
52 [ = ] __cuda_callable__( int i, int j ) mutable { // lambda function for swapping of elements
53 TNL::swap( array_view[ i ], array_view[ j ] );
54 TNL::swap( index_view[ i ], index_view[ j ] );
55 } );
56 std::cout << "Sorted array: " << array << std::endl;
57 std::cout << "Index: " << index << std::endl;
58}
59
60int
61main( int argc, char* argv[] )
62{
63 /***
64 * Firstly, test the sorting on CPU.
65 */
66 std::cout << "Sorting on CPU ... " << std::endl;
68 sort( host_array );
69
70#ifdef __CUDACC__
71 /***
72 * And then also on GPU.
73 */
74 std::cout << "Sorting on GPU ... " << std::endl;
76 sort( cuda_array );
77#endif
78 return EXIT_SUCCESS;
79}
Output
Sorting on CPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Index array: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
Sorted array: [ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Index: [ 4, 1, 6, 3, 0, 9, 5, 8, 7, 2 ]
Sorting on GPU ...
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Index array: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
Sorted array: [ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Index: [ 4, 1, 6, 0, 3, 9, 5, 7, 8, 2 ]

◆ staticFor()

template<typename Index , Index begin, Index end, typename Func , typename... ArgTypes>
constexpr void TNL::Algorithms::staticFor ( Func && f,
ArgTypes &&... args )
constexpr

Generic loop with constant bounds and indices usable in constant expressions.

staticFor is a generic C++17 implementation of a static for-loop using constexpr functions and template metaprogramming. It is equivalent to executing a function f(i, args...) for arguments i from the integral range [begin, end), but with the type std::integral_constant rather than int or std::size_t representing the indices. Hence, each index has its own distinct C++ type and the value of the index can be deduced from the type. The args... are additional user-supplied arguments that are forwarded to the staticFor function.

Also note that thanks to constexpr cast operator, the argument i can be used in constant expressions and the staticFor function can be used from the host code as well as CUDA kernels (TNL requires the --expt-relaxed-constexpr parameter when compiled by nvcc).

Template Parameters
Indexis the type of the loop indices.
beginis the left bound of the iteration range [begin, end).
endis the right bound of the iteration range [begin, end).
Funcis the type of the functor (it is usually deduced from the argument used in the function call).
ArgTypesare the types of additional arguments passed to the function.
Parameters
fis the functor to be called in each iteration.
argsare additional user-supplied arguments that are forwarded to each call of f.
Example
#include <iostream>
#include <array>
#include <tuple>
#include <TNL/Algorithms/staticFor.h>
/*
* Example function printing members of std::tuple using staticFor
* using lambda with capture.
*/
template< typename... Ts >
void
printTuple( const std::tuple< Ts... >& tupleVar )
{
std::cout << "{ ";
TNL::Algorithms::staticFor< size_t, 0, sizeof...( Ts ) >(
[ & ]( auto i )
{
std::cout << std::get< i >( tupleVar );
if( i < sizeof...( Ts ) - 1 )
std::cout << ", ";
} );
std::cout << " }" << std::endl;
}
struct TuplePrinter
{
constexpr TuplePrinter() = default;
template< typename Index, typename... Ts >
void
operator()( Index i, const std::tuple< Ts... >& tupleVar )
{
std::cout << std::get< i >( tupleVar );
if( i < sizeof...( Ts ) - 1 )
std::cout << ", ";
}
};
/*
* Example function printing members of std::tuple using staticFor
* and a structure with templated operator().
*/
template< typename... Ts >
void
printTupleCallableStruct( const std::tuple< Ts... >& tupleVar )
{
std::cout << "{ ";
TNL::Algorithms::staticFor< size_t, 0, sizeof...( Ts ) >( TuplePrinter(), tupleVar );
std::cout << " }" << std::endl;
}
int
main( int argc, char* argv[] )
{
// initiate std::array
std::array< int, 5 > a{ 1, 2, 3, 4, 5 };
// print out the array using template parameters for indexing
TNL::Algorithms::staticFor< int, 0, 5 >(
[ &a ]( auto i )
{
std::cout << "a[ " << i << " ] = " << std::get< i >( a ) << std::endl;
} );
// example of printing a tuple using staticFor and a lambda function
printTuple( std::make_tuple( "Hello", 3, 2.1 ) );
// example of printing a tuple using staticFor and a structure with templated operator()
printTupleCallableStruct( std::make_tuple( "Hello", 3, 2.1 ) );
}
T make_tuple(T... args)
constexpr void staticFor(Func &&f, ArgTypes &&... args)
Generic loop with constant bounds and indices usable in constant expressions.
Definition staticFor.h:60
Output
a[ 0 ] = 1
a[ 1 ] = 2
a[ 2 ] = 3
a[ 3 ] = 4
a[ 4 ] = 5
{ Hello, 3, 2.1 }
{ Hello, 3, 2.1 }

◆ unrolledFor()

template<typename Index , Index begin, Index end, Index unrollFactor = 8, typename Func >
constexpr void TNL::Algorithms::unrolledFor ( Func && f)
constexpr

Generic for-loop with explicit unrolling.

unrolledFor performs explicit loop unrolling of short loops which can improve performance in some cases. The bounds of the for-loop must be constant (i.e. known at the compile time). Loops longer than unrollFactor are not unrolled and executed as a normal for-loop.

The unroll factor is configurable, but note that full unrolling does not make sense for very long loops. It might even trigger the compiler's limit on recursive template instantiation. Also note that the compiler will (at least partially) unroll loops with static bounds anyway.

Template Parameters
Indexis the type of the loop indices.
beginis the left bound of the iteration range [begin, end).
endis the right bound of the iteration range [begin, end).
unrollFactoris the maximum length of loops to fully unroll via recursive template instantiation.
Funcis the type of the functor (it is usually deduced from the argument used in the function call).
Parameters
fis the functor to be called in each iteration.
Example
#include <iostream>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Algorithms/unrolledFor.h>
using namespace TNL;
using namespace TNL::Containers;
int
main( int argc, char* argv[] )
{
/****
* Create two static vectors
*/
const int Size( 3 );
a = 1.0;
b = 2.0;
double sum( 0.0 );
/****
* Compute an addition of a vector and a constant number.
*/
Algorithms::unrolledFor< int, 0, Size >(
[ & ]( int i )
{
a[ i ] = b[ i ] + 3.14;
sum += a[ i ];
} );
std::cout << "a = " << a << std::endl;
std::cout << "sum = " << sum << std::endl;
}
Vector with constant size.
Definition StaticVector.h:19
Output
a = [ 5.14, 5.14, 5.14 ]
sum = 15.42