Template Numerical Library: TNL::Algorithms Namespace Reference

Namespace for fundamental TNL algorithms. More...

Namespaces
namespace	Segments
	Namespace for the segments data structures.

Classes
struct	AtomicOperations

struct	AtomicOperations< Devices::Cuda >

struct	AtomicOperations< Devices::Host >

struct	AtomicOperations< Devices::Sequential >

class	CudaReductionBuffer

struct	Multireduction

struct	Multireduction< Devices::Cuda >

struct	Multireduction< Devices::Host >

struct	Multireduction< Devices::Sequential >

struct	SegmentedScan
	Computes segmented scan (or prefix sum) on a vector. More...

struct	SegmentedScan< Devices::Cuda, Type >

struct	SegmentedScan< Devices::Host, Type >

struct	SegmentedScan< Devices::Sequential, Type >

struct	SequentialFor
	Wrapper to ParallelFor which makes it run sequentially. More...

Functions
template<typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void	ascendingSort (Array &array, const Sorter &sorter=Sorter{})
	Function for sorting elements of array or vector in ascending order.

template<typename Array>
bool	contains (const Array &array, typename Array::ValueType value, typename Array::IndexType begin=0, typename Array::IndexType end=0)
	Checks if an array/vector/view contains an element with given value.

template<typename Array>
bool	containsOnlyValue (const Array &array, typename Array::ValueType value, typename Array::IndexType begin=0, typename Array::IndexType end=0)
	Checks if all elements of an array/vector/view have the given value.

template<typename DestinationDevice, typename SourceDevice = DestinationDevice, typename DestinationElement, typename SourceElement, typename Index>
void	copy (DestinationElement destination, const SourceElement source, Index size)
	Copies memory from source to destination.

template<typename DestinationDevice, typename DestinationElement, typename Index, typename SourceIterator>
void	copy (DestinationElement *destination, Index destinationSize, SourceIterator begin, SourceIterator end)
	Copies memory from source iterator range to destination.

template<typename Array, typename DestinationElement, typename = std::enable_if_t< IsArrayType< Array >::value >>
void	copy (std::vector< DestinationElement > &destination, const Array &source)
	Copies memory from the source TNL array-like container to the destination STL vector.

template<typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void	descendingSort (Array &array, const Sorter &sorter=Sorter{})
	Function for sorting elements of array or vector in descending order.

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction>
void	distributedExclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin, typename InputDistributedArray::IndexType end, Reduction &&reduction, typename OutputDistributedArray::ValueType identity)
	Computes an exclusive scan (or prefix sum) of a distributed array in-place.

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction = TNL::Plus>
void	distributedExclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin=0, typename InputDistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of distributedExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction>
void	distributedInclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin, typename InputDistributedArray::IndexType end, Reduction &&reduction, typename OutputDistributedArray::ValueType identity)
	Computes an inclusive scan (or prefix sum) of a distributed array in-place.

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction = TNL::Plus>
void	distributedInclusiveScan (const InputDistributedArray &input, OutputDistributedArray &output, typename InputDistributedArray::IndexType begin=0, typename InputDistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of distributedInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename DistributedArray, typename Reduction>
void	distributedInplaceExclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin, typename DistributedArray::IndexType end, Reduction &&reduction, typename DistributedArray::ValueType identity)
	Computes an exclusive scan (or prefix sum) of a distributed array in-place.

template<typename DistributedArray, typename Reduction = TNL::Plus>
void	distributedInplaceExclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin=0, typename DistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of distributedInplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename DistributedArray, typename Reduction>
void	distributedInplaceInclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin, typename DistributedArray::IndexType end, Reduction &&reduction, typename DistributedArray::ValueType identity)
	Computes an inclusive scan (or prefix sum) of a distributed array in-place.

template<typename DistributedArray, typename Reduction = TNL::Plus>
void	distributedInplaceInclusiveScan (DistributedArray &array, typename DistributedArray::IndexType begin=0, typename DistributedArray::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of distributedInplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename DestinationDevice, typename SourceDevice = DestinationDevice, typename DestinationElement, typename SourceElement, typename Index>
bool	equal (DestinationElement destination, const SourceElement source, Index size)
	Compares memory from source with destination.

template<typename InputArray, typename OutputArray, typename Reduction>
void	exclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
	Computes an exclusive scan (or prefix sum) of an input array and stores it in an output array.

template<typename InputArray, typename OutputArray, typename Reduction = TNL::Plus>
void	exclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin=0, typename InputArray::IndexType end=0, typename OutputArray::IndexType outputBegin=0, Reduction &&reduction=TNL::Plus{})
	Overload of exclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename Device, typename Element, typename Index>
void	fill (Element *data, const Element &value, Index size)
	Fills memory between `data` and `data + size` with a `value`.

template<typename Device, typename Element, typename Index>
void	fillRandom (Element *data, Index size, Element min_val, Element max_val)
	Fills memory between `data` and `data + size` with random Element values in the given range.

template<typename Container, typename ValueType>
std::pair< bool, typename Container::IndexType >	find (const Container &container, const ValueType &value)
	Find the first occurrence of a value in an array.

template<typename InputArray, typename OutputArray, typename Reduction>
void	inclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin, typename InputArray::IndexType end, typename OutputArray::IndexType outputBegin, Reduction &&reduction, typename OutputArray::ValueType identity)
	Computes an inclusive scan (or prefix sum) of an input array and stores it in an output array.

template<typename InputArray, typename OutputArray, typename Reduction = TNL::Plus>
void	inclusiveScan (const InputArray &input, OutputArray &output, typename InputArray::IndexType begin=0, typename InputArray::IndexType end=0, typename OutputArray::IndexType outputBegin=0, Reduction &&reduction=TNL::Plus{})
	Overload of inclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename Array, typename Reduction>
void	inplaceExclusiveScan (Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
	Computes an exclusive scan (or prefix sum) of an array in-place.

template<typename Array, typename Reduction = TNL::Plus>
void	inplaceExclusiveScan (Array &array, typename Array::IndexType begin=0, typename Array::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of inplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename Array, typename Reduction>
void	inplaceInclusiveScan (Array &array, typename Array::IndexType begin, typename Array::IndexType end, Reduction &&reduction, typename Array::ValueType identity)
	Computes an inclusive scan (or prefix sum) of an array in-place.

template<typename Array, typename Reduction = TNL::Plus>
void	inplaceInclusiveScan (Array &array, typename Array::IndexType begin=0, typename Array::IndexType end=0, Reduction &&reduction=TNL::Plus{})
	Overload of inplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

template<typename Array>
bool	isAscending (const Array &arr)
	Functions returning true if the array elements are sorted in ascending order.

template<typename Array>
bool	isDescending (const Array &arr)
	Functions returning true if the array elements are sorted in descending order.

template<typename Array, typename Compare>
bool	isSorted (const Array &arr, const Compare &compare)
	Functions returning true if the array elements are sorted according to the lmabda function `comparison`.

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>
std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > >	parallelFor (const Begin &begin, const End &end, Function f, FunctionArgs... args)
	Parallel for-loop function for 1D range specified with integral values with default launch configuration.

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>
std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value >	parallelFor (const Begin &begin, const End &end, Function f, FunctionArgs... args)
	Parallel for-loop function for range specified with multi-index values with default launch configuration.

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>
std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > >	parallelFor (const Begin &begin, const End &end, typename Device::LaunchConfiguration launch_config, Function f, FunctionArgs... args)
	Parallel for-loop function for 1D range specified with integral values.

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>
std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value >	parallelFor (const Begin &begin, const End &end, typename Device::LaunchConfiguration launch_config, Function f, FunctionArgs... args)
	Parallel for-loop function for range specified with multi-index values.

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction, typename Result>
auto	reduce (const Array &array, Reduction &&reduction, Result identity)
	Variant of reduce for arrays, views and compatible objects.

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction = TNL::Plus>
auto	reduce (const Array &array, Reduction &&reduction=TNL::Plus{})
	Variant of reduce for arrays, views and compatible objects.

template<typename Device, typename Index, typename Result, typename Fetch, typename Reduction>
Result	reduce (Index begin, Index end, Fetch &&fetch, Reduction &&reduction, const Result &identity)
	reduce implements (parallel) reduction for vectors and arrays.

template<typename Device, typename Index, typename Fetch, typename Reduction = TNL::Plus>
auto	reduce (Index begin, Index end, Fetch &&fetch, Reduction &&reduction=TNL::Plus{})
	Variant of reduce with a reduction function object instead of lambda function.

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction>
auto	reduceWithArgument (const Array &array, Reduction &&reduction)
	Variant of reduceWithArgument for arrays, views and compatible objects.

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction, typename Result>
auto	reduceWithArgument (const Array &array, Reduction &&reduction, Result identity)
	Variant of reduceWithArgument for arrays, views and compatible objects.

template<typename Device, typename Index, typename Fetch, typename Reduction>
auto	reduceWithArgument (Index begin, Index end, Fetch &&fetch, Reduction &&reduction)
	Variant of reduceWithArgument with reduction function object instead of lambda function.

template<typename Device, typename Index, typename Result, typename Fetch, typename Reduction>
std::pair< Result, Index >	reduceWithArgument (Index begin, Index end, Fetch &&fetch, Reduction &&reduction, const Result &identity)
	Variant of reduce returning also the position of the element of interest.

template<typename Array, typename Compare, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>
void	sort (Array &array, const Compare &compare, const Sorter &sorter=Sorter{})
	Function for sorting elements of array or vector based on a user defined comparison lambda function.

template<typename Device, typename Index, typename Compare, typename Swap, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType>
void	sort (const Index begin, const Index end, Compare &&compare, Swap &&swap, const Sorter &sorter=Sorter{})
	Function for general sorting based on lambda functions for comparison and swaping of two elements..

template<typename Index, Index begin, Index end, typename Func, typename... ArgTypes>
constexpr void	staticFor (Func &&f, ArgTypes &&... args)
	Generic loop with constant bounds and indices usable in constant expressions.

template<typename Index, Index begin, Index end, Index unrollFactor = 8, typename Func>
constexpr void	unrolledFor (Func &&f)
	Generic for-loop with explicit unrolling.

Detailed Description

Namespace for fundamental TNL algorithms.

It contains algorithms like for-loops, memory operations, (parallel) reduction, multireduction, scan etc.

Function Documentation

◆ ascendingSort()

template<typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>

void TNL::Algorithms::ascendingSort	(	Array &	array,
		const Sorter &	sorter = Sorter{} )

Function for sorting elements of array or vector in ascending order.

Template Parameters

Array	is a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Sorter	is an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.

Parameters

array	is an instance of array/array view/vector/vector view for sorting.
sorter	is an instance of sorter.

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/sort.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename ArrayT >
void
sort( ArrayT& array )
{
   const int size = 10;
 
   /****
    * Fill the array with random integers.
    */
   Array< int > aux_array( size );
   srand( size + 2021 );
   parallelFor< Devices::Host >( 0,
                                 size,
                                 [ & ]( int i )
                                 {
                                    aux_array[ i ] = std::rand() % ( 2 * size );
                                 } );
   array = aux_array;
 
   std::cout << "Random array: " << array << std::endl;
 
   /****
    * Sort the array in ascending order.
    */
   ascendingSort( array );
   std::cout << "Array sorted in ascending order:" << array << std::endl;
 
   /***
    * Sort the array in descending order.
    */
   descendingSort( array );
   std::cout << "Array sorted in descending order:" << array << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sorting on CPU.
    */
   std::cout << "Sorting on CPU ... " << std::endl;
   Array< int, Devices::Host > host_array;
   sort( host_array );
 
#ifdef __CUDACC__
   /***
    * And then also on GPU.
    */
   std::cout << "Sorting on GPU ... " << std::endl;
   Array< int, Devices::Cuda > cuda_array;
   sort( cuda_array );
#endif
   return EXIT_SUCCESS;
}

Output

Sorting on CPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ contains()

template<typename Array>

bool TNL::Algorithms::contains	(	const Array &	array,
		typename Array::ValueType	value,
		typename Array::IndexType	begin = 0,
		typename Array::IndexType	end = 0 )

Checks if an array/vector/view contains an element with given value.

By default, all elements of the array are checked. If begin or end is set to a non-zero value, only elements in the sub-interval [begin, end) are checked.

Parameters

array	The array to be searched.
value	The value to be checked.
begin	The beginning of the array sub-interval. It is 0 by default.
end	The end of the array sub-interval. The default value is 0 which is, however, replaced with the array size.

Returns: true if there is at least one element in the sub-interval [begin, end) which has the value value. Returns false if the range is empty.

◆ containsOnlyValue()

template<typename Array>

bool TNL::Algorithms::containsOnlyValue	(	const Array &	array,
		typename Array::ValueType	value,
		typename Array::IndexType	begin = 0,
		typename Array::IndexType	end = 0 )

Checks if all elements of an array/vector/view have the given value.

By default, all elements of the array are checked. If begin or end is set to a non-zero value, only elements in the sub-interval [begin, end) are checked.

Parameters

array	The array to be searched.
value	The value to be checked.
begin	The beginning of the array sub-interval. It is 0 by default.
end	The end of the array sub-interval. The default value is 0 which is, however, replaced with the array size.

Returns: true if all elements in the sub-interval [begin, end) have the same value value. Returns true if the range is empty.

◆ copy() [1/3]

template<typename DestinationDevice, typename SourceDevice = DestinationDevice, typename DestinationElement, typename SourceElement, typename Index>

void TNL::Algorithms::copy	(	DestinationElement *	destination,
		const SourceElement *	source,
		Index	size )

Copies memory from source to destination.

The source data is allocated on the device specified by SourceDevice and the destination data is allocated on the device specified by DestinationDevice.

Template Parameters

DestinationDevice	is the device where the destination data is allocated.
SourceDevice	is the device where the source data is allocated.
DestinationElement	is the type of the destination data.
SourceElement	is the type of the source data.
Index	is the type of the size of the data.

Parameters

destination	is the pointer to the destination data.
source	is the pointer to the source data.
size	is the size of the data.

◆ copy() [2/3]

template<typename DestinationDevice, typename DestinationElement, typename Index, typename SourceIterator>

void TNL::Algorithms::copy	(	DestinationElement *	destination,
		Index	destinationSize,
		SourceIterator	begin,
		SourceIterator	end )

Copies memory from source iterator range to destination.

The source data must be allocated on the host device. The destination data is allocated on the device specified by DestinationDevice.

Template Parameters

DestinationDevice	is the device where the destination data is allocated.
DestinationElement	is the type of the destination data.
Index	is the type of the size of the data.
SourceIterator	is the iterator type for the source data.

Parameters

destination	is the pointer to the destination data.
destinationSize	is the size of the destination data.
begin	is the iterator to the first element of the source data range.
end	is the one-past-the-end iterator of the source data range.

◆ copy() [3/3]

template<typename Array, typename DestinationElement, typename = std::enable_if_t< IsArrayType< Array >::value >>

void TNL::Algorithms::copy	(	std::vector< DestinationElement > &	destination,
		const Array &	source )

Copies memory from the source TNL array-like container to the destination STL vector.

Template Parameters

Array	is the type of array where the source data is stored. It can be for example TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector or TNL::Containers::VectorView.
DestinationElement	is the type of the destination data stored in the STL vector.

Parameters

destination	is the destination STL vector.
source	is the source TNL array.

◆ descendingSort()

template<typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>

void TNL::Algorithms::descendingSort	(	Array &	array,
		const Sorter &	sorter = Sorter{} )

Function for sorting elements of array or vector in descending order.

Template Parameters

Array	is a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Sorter	is an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.

Parameters

array	is an instance of array/array view/vector/vector view for sorting.
sorter	is an instance of sorter.

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/sort.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename ArrayT >
void
sort( ArrayT& array )
{
   const int size = 10;
 
   /****
    * Fill the array with random integers.
    */
   Array< int > aux_array( size );
   srand( size + 2021 );
   parallelFor< Devices::Host >( 0,
                                 size,
                                 [ & ]( int i )
                                 {
                                    aux_array[ i ] = std::rand() % ( 2 * size );
                                 } );
   array = aux_array;
 
   std::cout << "Random array: " << array << std::endl;
 
   /****
    * Sort the array in ascending order.
    */
   ascendingSort( array );
   std::cout << "Array sorted in ascending order:" << array << std::endl;
 
   /***
    * Sort the array in descending order.
    */
   descendingSort( array );
   std::cout << "Array sorted in descending order:" << array << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sorting on CPU.
    */
   std::cout << "Sorting on CPU ... " << std::endl;
   Array< int, Devices::Host > host_array;
   sort( host_array );
 
#ifdef __CUDACC__
   /***
    * And then also on GPU.
    */
   std::cout << "Sorting on GPU ... " << std::endl;
   Array< int, Devices::Cuda > cuda_array;
   sort( cuda_array );
#endif
   return EXIT_SUCCESS;
}

Output

Sorting on CPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ distributedExclusiveScan() [1/2]

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction>

void TNL::Algorithms::distributedExclusiveScan	(	const InputDistributedArray &	input,
		OutputDistributedArray &	output,
		typename InputDistributedArray::IndexType	begin,
		typename InputDistributedArray::IndexType	end,
		Reduction &&	reduction,
		typename OutputDistributedArray::ValueType	identity )

Computes an exclusive scan (or prefix sum) of a distributed array in-place.

Exclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $\sigma_1, \ldots, \sigma_n$ defined as

$\sigma_i = \sum_{j=1}^{i-1} a_i.$

Template Parameters

DistributedArray	type of the distributed array to be scanned
Reduction	type of the reduction functor

Parameters

input	input array
output	output array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

__cuda_callable__

#define __cuda_callable__

Definition Macros.h:49

◆ distributedExclusiveScan() [2/2]

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::distributedExclusiveScan	(	const InputDistributedArray &	input,
		OutputDistributedArray &	output,
		typename InputDistributedArray::IndexType	begin = 0,
		typename InputDistributedArray::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of distributedExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputDistributedArray::ValueType >(). See distributedExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ distributedInclusiveScan() [1/2]

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction>

void TNL::Algorithms::distributedInclusiveScan	(	const InputDistributedArray &	input,
		OutputDistributedArray &	output,
		typename InputDistributedArray::IndexType	begin,
		typename InputDistributedArray::IndexType	end,
		Reduction &&	reduction,
		typename OutputDistributedArray::ValueType	identity )

Computes an inclusive scan (or prefix sum) of a distributed array in-place.

Inclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $s_1, \ldots, s_n$ defined as

$s_i = \sum_{j=1}^i a_i.$

Template Parameters

DistributedArray	type of the distributed array to be scanned
Reduction	type of the reduction functor

Parameters

input	input array
output	output array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInclusiveScan() [2/2]

template<typename InputDistributedArray, typename OutputDistributedArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::distributedInclusiveScan	(	const InputDistributedArray &	input,
		OutputDistributedArray &	output,
		typename InputDistributedArray::IndexType	begin = 0,
		typename InputDistributedArray::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of distributedInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputDistributedArray::ValueType >(). See distributedInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ distributedInplaceExclusiveScan() [1/2]

template<typename DistributedArray, typename Reduction>

void TNL::Algorithms::distributedInplaceExclusiveScan	(	DistributedArray &	array,
		typename DistributedArray::IndexType	begin,
		typename DistributedArray::IndexType	end,
		Reduction &&	reduction,
		typename DistributedArray::ValueType	identity )

Computes an exclusive scan (or prefix sum) of a distributed array in-place.

Exclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $\sigma_1, \ldots, \sigma_n$ defined as

$\sigma_i = \sum_{j=1}^{i-1} a_i.$

Template Parameters

DistributedArray	type of the distributed array to be scanned
Reduction	type of the reduction functor

Parameters

array	input array, the result of scan is stored in the same array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInplaceExclusiveScan() [2/2]

template<typename DistributedArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::distributedInplaceExclusiveScan	(	DistributedArray &	array,
		typename DistributedArray::IndexType	begin = 0,
		typename DistributedArray::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of distributedInplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename DistributedArray::ValueType >(). See distributedInplaceExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ distributedInplaceInclusiveScan() [1/2]

template<typename DistributedArray, typename Reduction>

void TNL::Algorithms::distributedInplaceInclusiveScan	(	DistributedArray &	array,
		typename DistributedArray::IndexType	begin,
		typename DistributedArray::IndexType	end,
		Reduction &&	reduction,
		typename DistributedArray::ValueType	identity )

Computes an inclusive scan (or prefix sum) of a distributed array in-place.

Inclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $s_1, \ldots, s_n$ defined as

$s_i = \sum_{j=1}^i a_i.$

Template Parameters

DistributedArray	type of the distributed array to be scanned
Reduction	type of the reduction functor

Parameters

array	input array, the result of scan is stored in the same array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

◆ distributedInplaceInclusiveScan() [2/2]

template<typename DistributedArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::distributedInplaceInclusiveScan	(	DistributedArray &	array,
		typename DistributedArray::IndexType	begin = 0,
		typename DistributedArray::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of distributedInplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename DistributedArray::ValueType >(). See distributedInplaceInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ equal()

template<typename DestinationDevice, typename SourceDevice = DestinationDevice, typename DestinationElement, typename SourceElement, typename Index>

bool TNL::Algorithms::equal	(	DestinationElement *	destination,
		const SourceElement *	source,
		Index	size )

Compares memory from source with destination.

The source data is allocated on the device specified by SourceDevice and the destination data is allocated on the device specified by DestinationDevice.

Template Parameters

DestinationDevice	is the device where the destination data is allocated.
SourceDevice	is the device where the source data is allocated.
DestinationElement	is the type of the destination data.
SourceElement	is the type of the source data.
Index	is the type of the size of the data.

Parameters

destination	is the pointer to the destination data.
source	is the pointer to the source data.
size	is the size of the data.

Returns: true if all elements are equal, false otherwise.

◆ exclusiveScan() [1/2]

template<typename InputArray, typename OutputArray, typename Reduction>

void TNL::Algorithms::exclusiveScan	(	const InputArray &	input,
		OutputArray &	output,
		typename InputArray::IndexType	begin,
		typename InputArray::IndexType	end,
		typename OutputArray::IndexType	outputBegin,
		Reduction &&	reduction,
		typename OutputArray::ValueType	identity )

Computes an exclusive scan (or prefix sum) of an input array and stores it in an output array.

Exclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $\sigma_1, \ldots, \sigma_n$ defined as

$\sigma_i = \sum_{j=1}^{i-1} a_i.$

Template Parameters

InputArray	type of the array to be scanned
OutputArray	type of the output array
Reduction	type of the reduction functor

Parameters

input	the input array to be scanned
output	the array where the result will be stored
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
outputBegin	the first element in the output array to be written. There must be at least `end - begin` elements in the output array starting at the position given by `outputBegin`.
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the prefix sum with an array allocated on CPU.
    */
   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
   host_input = 1.0;
   std::cout << "host_input = " << host_input << std::endl;
   exclusiveScan( host_input, host_output );
   std::cout << "host_output " << host_output << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
   cuda_input = 1.0;
   std::cout << "cuda_input = " << cuda_input << std::endl;
   exclusiveScan( cuda_input, cuda_output );
   std::cout << "cuda_output " << cuda_output << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

host_output [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]

cuda_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

cuda_output [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]

◆ exclusiveScan() [2/2]

template<typename InputArray, typename OutputArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::exclusiveScan	(	const InputArray &	input,
		OutputArray &	output,
		typename InputArray::IndexType	begin = 0,
		typename InputArray::IndexType	end = 0,
		typename OutputArray::IndexType	outputBegin = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of exclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputArray::ValueType >(). See exclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ fill()

template<typename Device, typename Element, typename Index>

void TNL::Algorithms::fill	(	Element *	data,
		const Element &	value,
		Index	size )

Fills memory between data and data + size with a value.

Template Parameters

Device	is the device where the data is allocated.
Element	is the type of the data.
Index	is the type of the size of the data.

Parameters

data	is the pointer to the memory where the value will be set.
value	is the value to be filled.
size	is the size of the data.

◆ fillRandom()

template<typename Device, typename Element, typename Index>

void TNL::Algorithms::fillRandom	(	Element *	data,
		Index	size,
		Element	min_val,
		Element	max_val )

Fills memory between data and data + size with random Element values in the given range.

Template Parameters

Device	is the device where the data is allocated.
Element	is the type of the data.
Index	is the type of the size of the data.

Parameters

data	is the pointer to the memory where the random values will be set.
size	is the size of the data.
min_val	is the minimum random value
max_val	is the maximum random value

◆ find()

template<typename Container, typename ValueType>

std::pair< bool, typename Container::IndexType > TNL::Algorithms::find	(	const Container &	container,
		const ValueType &	value )

Find the first occurrence of a value in an array.

Template Parameters

Container	is the type of the container.
ValueType	is the type of the value to be found.
IndexType	is the type used for indexing.

Parameters

container	is the array where the value is searched.
value	is the value to be found.

Returns: pair (found, position) where found is a boolean indicating if the value was found and position is the position of the first occurrence in the container.

◆ inclusiveScan() [1/2]

template<typename InputArray, typename OutputArray, typename Reduction>

void TNL::Algorithms::inclusiveScan	(	const InputArray &	input,
		OutputArray &	output,
		typename InputArray::IndexType	begin,
		typename InputArray::IndexType	end,
		typename OutputArray::IndexType	outputBegin,
		Reduction &&	reduction,
		typename OutputArray::ValueType	identity )

Computes an inclusive scan (or prefix sum) of an input array and stores it in an output array.

Inclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $s_1, \ldots, s_n$ defined as

$s_i = \sum_{j=1}^i a_i.$

Template Parameters

InputArray	type of the array to be scanned
OutputArray	type of the output array
Reduction	type of the reduction functor

Parameters

input	the input array to be scanned
output	the array where the result will be stored
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
outputBegin	the first element in the output array to be written. There must be at least `end - begin` elements in the output array starting at the position given by `outputBegin`.
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the prefix sum with an array allocated on CPU.
    */
   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
   host_input = 1.0;
   std::cout << "host_input = " << host_input << std::endl;
   inclusiveScan( host_input, host_output );
   std::cout << "host_output " << host_output << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
   cuda_input = 1.0;
   std::cout << "cuda_input = " << cuda_input << std::endl;
   inclusiveScan( cuda_input, cuda_output );
   std::cout << "cuda_output " << cuda_output << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

host_output [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]

cuda_input = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

cuda_output [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]

◆ inclusiveScan() [2/2]

template<typename InputArray, typename OutputArray, typename Reduction = TNL::Plus>

void TNL::Algorithms::inclusiveScan	(	const InputArray &	input,
		OutputArray &	output,
		typename InputArray::IndexType	begin = 0,
		typename InputArray::IndexType	end = 0,
		typename OutputArray::IndexType	outputBegin = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of inclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename OutputArray::ValueType >(). See inclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to input.getSize().

◆ inplaceExclusiveScan() [1/2]

template<typename Array, typename Reduction>

void TNL::Algorithms::inplaceExclusiveScan	(	Array &	array,
		typename Array::IndexType	begin,
		typename Array::IndexType	end,
		Reduction &&	reduction,
		typename Array::ValueType	identity )

Computes an exclusive scan (or prefix sum) of an array in-place.

Exclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $\sigma_1, \ldots, \sigma_n$ defined as

$\sigma_i = \sum_{j=1}^{i-1} a_i.$

Template Parameters

Array	type of the array to be scanned
Reduction	type of the reduction functor

Parameters

array	input array, the result of scan is stored in the same array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the prefix sum with an array allocated on CPU.
    */
   Array< double, Devices::Host > host_a( 10 );
   host_a = 1.0;
   std::cout << "host_a = " << host_a << std::endl;
   inplaceExclusiveScan( host_a );
   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Array< double, Devices::Cuda > cuda_a( 10 );
   cuda_a = 1.0;
   std::cout << "cuda_a = " << cuda_a << std::endl;
   inplaceExclusiveScan( cuda_a );
   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The prefix sum of the host array is [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ].

cuda_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The prefix sum of the CUDA array is [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ].

◆ inplaceExclusiveScan() [2/2]

template<typename Array, typename Reduction = TNL::Plus>

void TNL::Algorithms::inplaceExclusiveScan	(	Array &	array,
		typename Array::IndexType	begin = 0,
		typename Array::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of inplaceExclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename Array::ValueType >(). See inplaceExclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ inplaceInclusiveScan() [1/2]

template<typename Array, typename Reduction>

void TNL::Algorithms::inplaceInclusiveScan	(	Array &	array,
		typename Array::IndexType	begin,
		typename Array::IndexType	end,
		Reduction &&	reduction,
		typename Array::ValueType	identity )

Computes an inclusive scan (or prefix sum) of an array in-place.

Inclusive scan (or prefix sum) operation turns a sequence $a_1, \ldots, a_n$ into a sequence $s_1, \ldots, s_n$ defined as

$s_i = \sum_{j=1}^i a_i.$

Template Parameters

Array	type of the array to be scanned
Reduction	type of the reduction functor

Parameters

array	input array, the result of scan is stored in the same array
begin	the first element in the array to be scanned
end	the last element in the array to be scanned
reduction	functor implementing the reduction operation
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

The reduction functor takes two variables to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/scan.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the prefix sum with an array allocated on CPU.
    */
   Array< double, Devices::Host > host_a( 10 );
   host_a = 1.0;
   std::cout << "host_a = " << host_a << std::endl;
   inplaceInclusiveScan( host_a );
   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Array< double, Devices::Cuda > cuda_a( 10 );
   cuda_a = 1.0;
   std::cout << "cuda_a = " << cuda_a << std::endl;
   inplaceInclusiveScan( cuda_a );
   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The prefix sum of the host array is [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ].

cuda_a = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The prefix sum of the CUDA array is [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ].

◆ inplaceInclusiveScan() [2/2]

template<typename Array, typename Reduction = TNL::Plus>

void TNL::Algorithms::inplaceInclusiveScan	(	Array &	array,
		typename Array::IndexType	begin = 0,
		typename Array::IndexType	end = 0,
		Reduction &&	reduction = TNL::Plus{} )

Overload of inplaceInclusiveScan which uses a TNL functional object for reduction. TNL::Plus is used by default.

The identity element is taken as reduction.template getIdentity< typename Array::ValueType >(). See inplaceInclusiveScan for the explanation of other parameters. Note that when end equals 0 (the default), it is set to array.getSize().

◆ isAscending()

template<typename Array>

bool TNL::Algorithms::isAscending ( const Array & arr )

Functions returning true if the array elements are sorted in ascending order.

Template Parameters

Array is the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.

Parameters

arr	is an instance of tested array.

Returns: true if the array is sorted in ascending order.; false if the array is NOT sorted in ascending order.

◆ isDescending()

template<typename Array>

bool TNL::Algorithms::isDescending ( const Array & arr )

Functions returning true if the array elements are sorted in descending order.

Template Parameters

Array is the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.

Parameters

arr	is an instance of tested array.

Returns: true if the array is sorted in descending order.; false if the array is NOT sorted in descending order.

◆ isSorted()

template<typename Array, typename Compare>

bool TNL::Algorithms::isSorted	(	const Array &	arr,
		const Compare &	compare )

Functions returning true if the array elements are sorted according to the lmabda function comparison.

Template Parameters

Array	is the type of array/vector. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Compare	is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given by indices representing their positions. The lambda function is supposed to be defined as follows: auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };

Parameters

arr	is an instance of tested array.
compare	is an instance of the lambda function for elements comparison.

Returns: true if the array is sorted in ascending order.; false if the array is NOT sorted in ascending order.

◆ parallelFor() [1/2]

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>

std::enable_if_t< std::is_integral_v< Begin > &&std::is_integral_v< End > > TNL::Algorithms::parallelFor	(	const Begin &	begin,
		const End &	end,
		typename Device::LaunchConfiguration	launch_config,
		Function	f,
		FunctionArgs...	args )

Parallel for-loop function for 1D range specified with integral values.

Template Parameters

Device	is a type of the device where the reduction will be performed.
Begin	must be an integral type.
End	must be an integral type.

Parameters

begin	is the left bound of the iteration range `[begin, end)`.
end	is the right bound of the iteration range `[begin, end)`.
f	is the function to be called in each iteration. Arguments of the function are the iteration index and arguments from the `args...` variadic pack.
launch_config	specifies kernel launch parameters.
args	are additional parameters to be passed to the function f.

Example: #include <iostream>

#include <TNL/Containers/Vector.h>

#include <TNL/Algorithms/parallelFor.h>

using namespace TNL;

using namespace TNL::Containers;

using namespace TNL::Algorithms;

/****

* Set all elements of the vector v to the constant c.

*/

template< typename Device >

void

initVector( Vector< double, Device >& v, const double& c )

{

auto view = v.getView();

auto init = [ = ] __cuda_callable__( int i ) mutable

{

view[ i ] = c;

};

parallelFor< Device >( 0, v.getSize(), init );

}

int

main( int argc, char* argv[] )

{

/***

* Firstly, test the vector initiation on CPU.

*/

Vector< double, Devices::Host > host_v( 10 );

initVector( host_v, 1.0 );

std::cout << "host_v = " << host_v << std::endl;

/***

* And then also on GPU.

*/

#ifdef __CUDACC__

Vector< double, Devices::Cuda > cuda_v( 10 );

initVector( cuda_v, 1.0 );

std::cout << "cuda_v = " << cuda_v << std::endl;

#endif

return EXIT_SUCCESS;

}

TNL::Containers::Array::getSize
__cuda_callable__ IndexType getSize() const
Returns the current array size.
Definition Array.hpp:249

TNL::Containers::Vector
Vector extends Array with algebraic operations.
Definition Vector.h:36

TNL::Containers::Vector::getView
ViewType getView(IndexType begin=0, IndexType end=0)
Returns a modifiable view of the vector.
Definition Vector.hpp:25

Output: host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

◆ parallelFor() [2/2]

template<typename Device, typename Begin, typename End, typename Function, typename... FunctionArgs>

std::enable_if_t< IsStaticArrayType< Begin >::value &&IsStaticArrayType< End >::value > TNL::Algorithms::parallelFor	(	const Begin &	begin,
		const End &	end,
		typename Device::LaunchConfiguration	launch_config,
		Function	f,
		FunctionArgs...	args )

Parallel for-loop function for range specified with multi-index values.

Template Parameters

Device	is a type of the device where the reduction will be performed.
Begin	must satisfy the constraints checked by the TNL::IsStaticArrayType type trait.
End	must satisfy the constraints checked by the TNL::IsStaticArrayType type trait.

Parameters

begin	is the left bound of the iteration range `[begin, end)`.
end	is the right bound of the iteration range `[begin, end)`.
f	is the function to be called in each iteration. Arguments of the function are the iteration multi-index, which is an instance of the `End` type, and arguments from the `args...` variadic pack.
launch_config	specifies kernel launch parameters.
args	are additional parameters to be passed to the function f.

Example: #include <iostream>

#include <TNL/Containers/Vector.h>

#include <TNL/Containers/StaticArray.h>

#include <TNL/Algorithms/parallelFor.h>

using namespace TNL;

using namespace TNL::Containers;

using namespace TNL::Algorithms;

template< typename Device >

void

initMeshFunction( const int xSize, const int ySize, const int zSize, Vector< double, Device >& v, const double& c )

{

auto view = v.getView();

auto init = [ = ] __cuda_callable__( const StaticArray< 3, int >& i ) mutable

{

view[ ( i.z() * ySize + i.y() ) * xSize + i.x() ] = c;

};

StaticArray< 3, int > begin{ 0, 0, 0 };

StaticArray< 3, int > end{ xSize, ySize, zSize };

parallelFor< Device >( begin, end, init );

}

int

main( int argc, char* argv[] )

{

/***

* Define dimensions of a 3D mesh function.

*/

const int xSize( 10 ), ySize( 10 ), zSize( 10 );

const int size = xSize * ySize * zSize;

/***

* Firstly, test the mesh function initiation on CPU.

*/

Vector< double, Devices::Host > host_v( size );

initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );

/***

* And then also on GPU.

*/

#ifdef __CUDACC__

Vector< double, Devices::Cuda > cuda_v( size );

initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );

#endif

return EXIT_SUCCESS;

}

TNL::Containers::StaticArray
Array with constant size.
Definition StaticArray.h:20

Output

◆ reduce() [1/4]

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction, typename Result>

auto TNL::Algorithms::reduce	(	const Array &	array,
		Reduction &&	reduction,
		Result	identity )

Variant of reduce for arrays, views and compatible objects.

The referenced reduce function is called with:

Device, which is typename Array::DeviceType by default, as the Device type,
0 as the beginning of the interval for reduction,
array.getSize() as the end of the interval for reduction,
array.getConstView() as the fetch functor,
reduction as the reduction operation,
and identity as the identity element of the reduction.

Example

#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
 
template< typename Device >
void
reduceArrayExample()
{
   /****
    * Create new arrays
    */
   const int size = 10;
   Containers::Array< float, Device > a( size );
 
   /****
    * Initiate the elements of array `a`
    */
   a.forAllElements(
      [] __cuda_callable__( int i, float& value )
      {
         value = i;
      } );
 
   /****
    * Sum all elements of array `a`
    */
   float sum_total = Algorithms::reduce( a, TNL::Plus{} );
 
   /****
    * Sum last 5 elements of array `a`
    */
   float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
 
   /****
    * Print the results
    */
   std::cout << " a = " << a << std::endl;
   std::cout << " sum of all elements = " << sum_total << std::endl;
   std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   std::cout << "Running example on the host system: " << std::endl;
   reduceArrayExample< Devices::Host >();
 
#ifdef __CUDACC__
   std::cout << "Running example on the CUDA device: " << std::endl;
   reduceArrayExample< Devices::Cuda >();
#endif
}

Output

Running example on the host system: 
 a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
 sum of all elements = 45
 sum of last 5 elements = 35
Running example on the CUDA device: 
 a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
 sum of all elements = 45
 sum of last 5 elements = 35

◆ reduce() [2/4]

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction = TNL::Plus>

auto TNL::Algorithms::reduce	(	const Array &	array,
		Reduction &&	reduction = TNL::Plus{} )

Variant of reduce for arrays, views and compatible objects.

The referenced reduce function is called with:

Device, which is typename Array::DeviceType by default, as the Device type,
0 as the beginning of the interval for reduction,
array.getSize() as the end of the interval for reduction,
array.getConstView() as the fetch functor,
reduction as the reduction operation, it must be an instance of some Function objects for reduction operations
and the identity element obtained from the reduction function object.

Example

#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
 
template< typename Device >
void
reduceArrayExample()
{
   /****
    * Create new arrays
    */
   const int size = 10;
   Containers::Array< float, Device > a( size );
 
   /****
    * Initiate the elements of array `a`
    */
   a.forAllElements(
      [] __cuda_callable__( int i, float& value )
      {
         value = i;
      } );
 
   /****
    * Sum all elements of array `a`
    */
   float sum_total = Algorithms::reduce( a, TNL::Plus{} );
 
   /****
    * Sum last 5 elements of array `a`
    */
   float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
 
   /****
    * Print the results
    */
   std::cout << " a = " << a << std::endl;
   std::cout << " sum of all elements = " << sum_total << std::endl;
   std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   std::cout << "Running example on the host system: " << std::endl;
   reduceArrayExample< Devices::Host >();
 
#ifdef __CUDACC__
   std::cout << "Running example on the CUDA device: " << std::endl;
   reduceArrayExample< Devices::Cuda >();
#endif
}

Output

Running example on the host system: 
 a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
 sum of all elements = 45
 sum of last 5 elements = 35
Running example on the CUDA device: 
 a = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
 sum of all elements = 45
 sum of last 5 elements = 35

◆ reduce() [3/4]

template<typename Device, typename Index, typename Result, typename Fetch, typename Reduction>

Result TNL::Algorithms::reduce	(	Index	begin,
		Index	end,
		Fetch &&	fetch,
		Reduction &&	reduction,
		const Result &	identity )

reduce implements (parallel) reduction for vectors and arrays.

Reduction can be used for operations having one or more vectors (or arrays) elements as input and returning one number (or element) as output. Some examples of such operations can be vectors/arrays comparison, vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even the position of the smallest or the largest element, the function reduceWithArgument can be used.

Template Parameters

Device	is a type of the device where the reduction will be performed. It can be on of the following: TNL::Devices::Sequential, TNL::Devices::Host, TNL::Devices::Cuda.
Index	is a type for indexing.
Result	is a type of the reduction result.
Fetch	is a lambda function for fetching the input data.
Reduction	is a function object performing the reduction.

Parameters

begin	defines range [begin, end) of indexes which will be used for the reduction.
end	defines range [begin, end) of indexes which will be used for the reduction.
fetch	is a lambda function fetching the input data.
reduction	is a function object defining the reduction operation. This can be a user-defined lambda function or an instance of some Function objects for reduction operations.
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

Returns: result of the reduction

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

The reduction lambda function takes two variables which are supposed to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };

Example

#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename Device >
double
sum( const Vector< double, Device >& v )
{
   /****
    * Get vector view which can be captured by lambda.
    */
   auto view = v.getConstView();
 
   /****
    * The fetch function just reads elements of vector v.
    */
   auto fetch = [ = ] __cuda_callable__( int i ) -> double
   {
      return view[ i ];
   };
 
   /***
    * Reduction is sum of two numbers.
    */
   auto reduction = [] __cuda_callable__( const double& a, const double& b )
   {
      return a + b;
   };
 
   /***
    * Finally we call the templated function Reduction and pass number of elements to reduce,
    * lambdas defined above and finally value of identity element, zero in this case, which serve for the
    * reduction initiation.
    */
   return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sum with vectors allocated on CPU.
    */
   Vector< double, Devices::Host > host_v( 10 );
   host_v = 1.0;
   std::cout << "host_v = " << host_v << std::endl;
   std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Vector< double, Devices::Cuda > cuda_v( 10 );
   cuda_v = 1.0;
   std::cout << "cuda_v = " << cuda_v << std::endl;
   std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The sum of the host vector elements is 10.

cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The sum of the CUDA vector elements is 10.

◆ reduce() [4/4]

template<typename Device, typename Index, typename Fetch, typename Reduction = TNL::Plus>

auto TNL::Algorithms::reduce	(	Index	begin,
		Index	end,
		Fetch &&	fetch,
		Reduction &&	reduction = TNL::Plus{} )

Variant of reduce with a reduction function object instead of lambda function.

Template Parameters

Device	is a type of the device where the reduction will be performed. It can be on of the following: TNL::Devices::Sequential, TNL::Devices::Host, TNL::Devices::Cuda.
Index	is a type for indexing.
Fetch	is a lambda function for fetching the input data.
Reduction	is a function object performing the reduction.

Parameters

begin	defines range [begin, end) of indexes which will be used for the reduction.
end	defines range [begin, end) of indexes which will be used for the reduction.
fetch	is a lambda function fetching the input data.
reduction	is a function object defining the reduction operation, it must be an instance of some Function objects for reduction operations.

Returns: result of the reduction

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

Example

#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename Device >
double
sum( const Vector< double, Device >& v )
{
   /****
    * Get vector view which can be captured by lambda.
    */
   auto view = v.getConstView();
 
   /****
    * The fetch function just reads elements of vector v.
    */
   auto fetch = [ = ] __cuda_callable__( int i ) -> double
   {
      return view[ i ];
   };
 
   /***
    * Finally we call the templated function Reduction and pass number of elements to reduce,
    * lambda defined above and functional representing the reduction operation.
    */
   return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus{} );
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sum with vectors allocated on CPU.
    */
   Vector< double, Devices::Host > host_v( 10 );
   host_v = 1.0;
   std::cout << "host_v = " << host_v << std::endl;
   std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
 
   /***
    * And then also on GPU.
    */
#ifdef __CUDACC__
   Vector< double, Devices::Cuda > cuda_v( 10 );
   cuda_v = 1.0;
   std::cout << "cuda_v = " << cuda_v << std::endl;
   std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The sum of the host vector elements is 10.

cuda_v = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ]

The sum of the CUDA vector elements is 10.

◆ reduceWithArgument() [1/4]

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction>

auto TNL::Algorithms::reduceWithArgument	(	const Array &	array,
		Reduction &&	reduction )

Variant of reduceWithArgument for arrays, views and compatible objects.

Reduction can be one of TNL::MinWithArg, TNL::MaxWithArg.

The referenced reduceWithArgument function is called with:

Device, which is typename Array::DeviceType by default, as the Device type,
0 as the beginning of the interval for reduction,
array.getSize() as the end of the interval for reduction,
array.getConstView() as the fetch functor,
reduction as the reduction operation, it must be an instance of some Function objects for reduction operations with argument
and the identity element obtained from the reduction function object.

Example

#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
 
template< typename Device >
void
reduceArrayExample()
{
   /****
    * Create new arrays
    */
   const int size = 10;
   Containers::Vector< float, Device > a( size );
 
   /****
    * Initiate the elements of array `a`
    */
   a.forAllElements(
      [] __cuda_callable__( int i, float& value )
      {
         value = 3 - i;
      } );
 
   /****
    * Reduce all elements of array `a`
    */
   std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
 
   /****
    * Print the results
    */
   std::cout << " a = " << a << std::endl;
   std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   std::cout << "Running example on the host system: " << std::endl;
   reduceArrayExample< Devices::Host >();
 
#ifdef __CUDACC__
   std::cout << "Running example on the CUDA device: " << std::endl;
   reduceArrayExample< Devices::Cuda >();
#endif
}

Output

Running example on the host system: 
 a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
 abs-max of all elements = 6 at position 9
Running example on the CUDA device: 
 a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
 abs-max of all elements = 6 at position 9

◆ reduceWithArgument() [2/4]

template<typename Array, typename Device = typename Array::DeviceType, typename Reduction, typename Result>

auto TNL::Algorithms::reduceWithArgument	(	const Array &	array,
		Reduction &&	reduction,
		Result	identity )

Variant of reduceWithArgument for arrays, views and compatible objects.

The referenced reduceWithArgument function is called with:

Device, which is typename Array::DeviceType by default, as the Device type,
0 as the beginning of the interval for reduction,
array.getSize() as the end of the interval for reduction,
array.getConstView() as the fetch functor,
reduction as the reduction operation,
and identity as the identity element of the reduction.

Example

#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
 
template< typename Device >
void
reduceArrayExample()
{
   /****
    * Create new arrays
    */
   const int size = 10;
   Containers::Vector< float, Device > a( size );
 
   /****
    * Initiate the elements of array `a`
    */
   a.forAllElements(
      [] __cuda_callable__( int i, float& value )
      {
         value = 3 - i;
      } );
 
   /****
    * Reduce all elements of array `a`
    */
   std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
 
   /****
    * Print the results
    */
   std::cout << " a = " << a << std::endl;
   std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   std::cout << "Running example on the host system: " << std::endl;
   reduceArrayExample< Devices::Host >();
 
#ifdef __CUDACC__
   std::cout << "Running example on the CUDA device: " << std::endl;
   reduceArrayExample< Devices::Cuda >();
#endif
}

Output

Running example on the host system: 
 a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
 abs-max of all elements = 6 at position 9
Running example on the CUDA device: 
 a = [ 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 ]
 abs-max of all elements = 6 at position 9

◆ reduceWithArgument() [3/4]

template<typename Device, typename Index, typename Fetch, typename Reduction>

auto TNL::Algorithms::reduceWithArgument	(	Index	begin,
		Index	end,
		Fetch &&	fetch,
		Reduction &&	reduction )

Variant of reduceWithArgument with reduction function object instead of lambda function.

Template Parameters

Device	is a type of the device where the reduction will be performed. It can be on of the following: TNL::Devices::Sequential, TNL::Devices::Host, TNL::Devices::Cuda.
Index	is a type for indexing.
Result	is a type of the reduction result.
Reduction	is a function object performing the reduction.
Fetch	is a lambda function for fetching the input data.

Parameters

begin	defines range [begin, end) of indexes which will be used for the reduction.
end	defines range [begin, end) of indexes which will be used for the reduction.
fetch	is a lambda function fetching the input data.
reduction	is an instance of lambda function or a function object defining the reduction operation. This must be an instance of some Function objects for reduction operations with argument.

Returns: result of the reduction in a form of std::pair<Index, Result> structure, where pair.first is the element position and pair.second is the reduction result.

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

Example

#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename Device >
std::pair< double, int >
maximumNorm( const Vector< double, Device >& v )
{
   auto view = v.getConstView();
 
   auto fetch = [ = ] __cuda_callable__( int i )
   {
      return abs( view[ i ] );
   };
   return reduceWithArgument< Device >( 0, view.getSize(), fetch, TNL::MaxWithArg{} );
}
 
int
main( int argc, char* argv[] )
{
   Vector< double, Devices::Host > host_v( 10 );
   host_v.forAllElements(
      [] __cuda_callable__( int i, double& value )
      {
         value = i - 7;
      } );
   std::cout << "host_v = " << host_v << std::endl;
   auto maxNormHost = maximumNorm( host_v );
   std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second
             << "." << std::endl;
#ifdef __CUDACC__
   Vector< double, Devices::Cuda > cuda_v( 10 );
   cuda_v.forAllElements(
      [] __cuda_callable__( int i, double& value )
      {
         value = i - 7;
      } );
   std::cout << "cuda_v = " << cuda_v << std::endl;
   auto maxNormCuda = maximumNorm( cuda_v );
   std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position "
             << maxNormCuda.second << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]

The maximum norm of the host vector elements is 7 at position 0.

cuda_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]

The maximum norm of the device vector elements is 7 at position 0.

◆ reduceWithArgument() [4/4]

template<typename Device, typename Index, typename Result, typename Fetch, typename Reduction>

std::pair< Result, Index > TNL::Algorithms::reduceWithArgument	(	Index	begin,
		Index	end,
		Fetch &&	fetch,
		Reduction &&	reduction,
		const Result &	identity )

Variant of reduce returning also the position of the element of interest.

For example, in case of computing minimal or maximal element in array/vector, the position of the element having given value can be obtained. This method is, however, more flexible.

Template Parameters

Device	is a type of the device where the reduction will be performed. It can be on of the following: TNL::Devices::Sequential, TNL::Devices::Host, TNL::Devices::Cuda.
Index	is a type for indexing.
Result	is a type of the reduction result.
Reduction	is a function object performing the reduction.
Fetch	is a lambda function for fetching the input data.

Parameters

begin	defines range [begin, end) of indexes which will be used for the reduction.
end	defines range [begin, end) of indexes which will be used for the reduction.
fetch	is a lambda function fetching the input data.
reduction	is an instance of lambda function or function object defining the reduction operation. This can be a user-defined lambda function or an instance of some Function objects for reduction operations with argument.
identity	is the identity element for the reduction operation, i.e. element which does not change the result of the reduction.

Returns: result of the reduction in a form of std::pair<Index, Result> structure, where pair.first is the element position and pair.second is the reduction result.

The fetch lambda function takes one argument which is index of the element to be fetched:

auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };

The reduction lambda function takes two variables which are supposed to be reduced:

auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };

Example

#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/reduce.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename Device >
std::pair< double, int >
maximumNorm( const Vector< double, Device >& v )
{
   auto view = v.getConstView();
 
   auto fetch = [ = ] __cuda_callable__( int i )
   {
      return abs( view[ i ] );
   };
   auto reduction = [] __cuda_callable__( double& a, const double& b, int& aIdx, const int& bIdx )
   {
      if( a < b ) {
         a = b;
         aIdx = bIdx;
      }
      else if( a == b && bIdx < aIdx )
         aIdx = bIdx;
   };
   return reduceWithArgument< Device >( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::lowest() );
}
 
int
main( int argc, char* argv[] )
{
   Vector< double, Devices::Host > host_v( 10 );
   host_v.forAllElements(
      [] __cuda_callable__( int i, double& value )
      {
         value = i - 7;
      } );
   std::cout << "host_v = " << host_v << std::endl;
   auto maxNormHost = maximumNorm( host_v );
   std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second
             << "." << std::endl;
#ifdef __CUDACC__
   Vector< double, Devices::Cuda > cuda_v( 10 );
   cuda_v.forAllElements(
      [] __cuda_callable__( int i, double& value )
      {
         value = i - 7;
      } );
   std::cout << "cuda_v = " << cuda_v << std::endl;
   auto maxNormCuda = maximumNorm( cuda_v );
   std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position "
             << maxNormCuda.second << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}

Output

host_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]

The maximum norm of the host vector elements is 7 at position 0.

cuda_v = [ -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 ]

The maximum norm of the device vector elements is 7 at position 0.

◆ sort() [1/2]

template<typename Array, typename Compare, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType>

void TNL::Algorithms::sort	(	Array &	array,
		const Compare &	compare,
		const Sorter &	sorter = Sorter{} )

Function for sorting elements of array or vector based on a user defined comparison lambda function.

Template Parameters

Array	is a type of container to be sorted. It can be, for example, TNL::Containers::Array, TNL::Containers::ArrayView, TNL::Containers::Vector, TNL::Containers::VectorView.
Compare	is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The lambda function is supposed to be defined as follows (`ValueType` is type of the array elements): auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... };
Sorter	is an algorithm for sorting. It can be TNL::Algorithms::Sorting::STLSort for sorting on host and TNL::Algorithms::Sorting::Quicksort or TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.

Parameters

array	is an instance of array/array view/vector/vector view for sorting.
compare	is an instance of the lambda function for comparison of two elements.
sorter	is an instance of sorter.

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/sort.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename ArrayT >
void
sort( ArrayT& array )
{
   const int size = 10;
 
   /****
    * Fill the array with random integers.
    */
   Array< int > aux_array( size );
   srand( size + 2021 );
   parallelFor< Devices::Host >( 0,
                                 size,
                                 [ & ]( int i )
                                 {
                                    aux_array[ i ] = std::rand() % ( 2 * size );
                                 } );
   array = aux_array;
 
   std::cout << "Random array: " << array << std::endl;
 
   /****
    * Sort the array in ascending order.
    */
   sort( array,
         [] __cuda_callable__( int a, int b )
         {
            return a < b;
         } );
   std::cout << "Array sorted in ascending order:" << array << std::endl;
 
   /***
    * Sort the array in descending order.
    */
   sort( array,
         [] __cuda_callable__( int a, int b )
         {
            return a > b;
         } );
   std::cout << "Array sorted in descending order:" << array << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sorting on CPU.
    */
   std::cout << "Sorting on CPU ... " << std::endl;
   Array< int, Devices::Host > host_array;
   sort( host_array );
 
#ifdef __CUDACC__
   /***
    * And then also on GPU.
    */
   std::cout << "Sorting on GPU ... " << std::endl;
   Array< int, Devices::Cuda > cuda_array;
   sort( cuda_array );
#endif
   return EXIT_SUCCESS;
}

Output

Sorting on CPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]
Sorting on GPU ... 
Random array: [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Array sorted in ascending order:[ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Array sorted in descending order:[ 15, 14, 14, 11, 8, 5, 5, 2, 1, 0 ]

◆ sort() [2/2]

template<typename Device, typename Index, typename Compare, typename Swap, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType>

void TNL::Algorithms::sort	(	const Index	begin,
		const Index	end,
		Compare &&	compare,
		Swap &&	swap,
		const Sorter &	sorter = Sorter{} )

Function for general sorting based on lambda functions for comparison and swaping of two elements..

Template Parameters

Device	is device on which the sorting algorithms should be executed.
Index	is type used for indexing of the sorted data.
Compare	is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given by indices representing their positions. The lambda function is supposed to be defined as follows: auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };
Swap	is a lambda function for swaping of two elements which are ordered wrong way. Both elements are represented by indices as well. It supposed to be defined as: auto swap = [=] __cuda_callable__ ( const Index& a , const Index& b ) mutable { swap( ....); }; TNL::swap __cuda_callable__ constexpr void swap(Type &a, Type &b) This function swaps values of two parameters. Definition Math.h:496
Sorter	is an algorithm for sorting. It can be TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. Currently there is no algorithm for CPU :(.

Parameters

begin	is the first index of the range `[begin, end)` to be sorted.
end	is the end index of the range `[begin, end)` to be sorted.
compare	is an instance of the lambda function for comparison of two elements.
swap	is an instance of the lambda function for swapping of two elements.
sorter	is an instance of sorter.

Example

#include <iostream>
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/sort.h>
 
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
 
template< typename ArrayT >
void
sort( ArrayT& array )
{
   const int size = 10;
 
   /****
    * Fill the array with random integers.
    */
   Array< int > aux_array( size );
   srand( size + 2021 );
   parallelFor< Devices::Host >( 0,
                                 size,
                                 [ & ]( int i )
                                 {
                                    aux_array[ i ] = std::rand() % ( 2 * size );
                                 } );
   array = aux_array;
 
   /***
    * Prepare second array holding elements positions.
    */
   ArrayT index( size );
   index.forAllElements(
      [] __cuda_callable__( int idx, int& value )
      {
         value = idx;
      } );
   std::cout << "Random array:     " << array << std::endl;
   std::cout << "Index array:      " << index << std::endl;
 
   /***
    * Sort the array `array` and apply the same permutation on the array `identity`.
    */
   auto array_view = array.getView();
   auto index_view = index.getView();
   sort< typename ArrayT::DeviceType,   // device on which the sorting will be performed
         typename ArrayT::IndexType >(  // type used for indexing
      0,
      size,                                              // range of indexes
      [ = ] __cuda_callable__( int i, int j ) -> bool {  // comparison lambda function
         return array_view[ i ] < array_view[ j ];
      },
      [ = ] __cuda_callable__( int i, int j ) mutable {  // lambda function for swapping of elements
         TNL::swap( array_view[ i ], array_view[ j ] );
         TNL::swap( index_view[ i ], index_view[ j ] );
      } );
   std::cout << "Sorted array:      " << array << std::endl;
   std::cout << "Index:             " << index << std::endl;
}
 
int
main( int argc, char* argv[] )
{
   /***
    * Firstly, test the sorting on CPU.
    */
   std::cout << "Sorting on CPU ... " << std::endl;
   Array< int, Devices::Host > host_array;
   sort( host_array );
 
#ifdef __CUDACC__
   /***
    * And then also on GPU.
    */
   std::cout << "Sorting on GPU ... " << std::endl;
   Array< int, Devices::Cuda > cuda_array;
   sort( cuda_array );
#endif
   return EXIT_SUCCESS;
}

Output

Sorting on CPU ... 
Random array:     [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Index array:      [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
Sorted array:      [ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Index:             [ 4, 1, 6, 3, 0, 9, 5, 8, 7, 2 ]
Sorting on GPU ... 
Random array:     [ 5, 1, 15, 5, 0, 11, 2, 14, 14, 8 ]
Index array:      [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
Sorted array:      [ 0, 1, 2, 5, 5, 8, 11, 14, 14, 15 ]
Index:             [ 4, 1, 6, 0, 3, 9, 5, 7, 8, 2 ]

◆ staticFor()

template<typename Index, Index begin, Index end, typename Func, typename... ArgTypes>

void TNL::Algorithms::staticFor	(	Func &&	f,
		ArgTypes &&...	args )

constexpr

Generic loop with constant bounds and indices usable in constant expressions.

staticFor is a generic C++17 implementation of a static for-loop using constexpr functions and template metaprogramming. It is equivalent to executing a function f(i, args...) for arguments i from the integral range [begin, end), but with the type std::integral_constant rather than int or std::size_t representing the indices. Hence, each index has its own distinct C++ type and the value of the index can be deduced from the type. The args... are additional user-supplied arguments that are forwarded to the staticFor function.

Also note that thanks to constexpr cast operator, the argument i can be used in constant expressions and the staticFor function can be used from the host code as well as CUDA kernels (TNL requires the --expt-relaxed-constexpr parameter when compiled by nvcc).

Template Parameters

Index	is the type of the loop indices.
begin	is the left bound of the iteration range `[begin, end)`.
end	is the right bound of the iteration range `[begin, end)`.
Func	is the type of the functor (it is usually deduced from the argument used in the function call).
ArgTypes	are the types of additional arguments passed to the function.

Parameters

f	is the functor to be called in each iteration.
args	are additional user-supplied arguments that are forwarded to each call of f.

Example: #include <iostream>

#include <array>

#include <tuple>

#include <TNL/Algorithms/staticFor.h>

/*

* Example function printing members of std::tuple using staticFor

* using lambda with capture.

*/

template< typename... Ts >

void

printTuple( const std::tuple< Ts... >& tupleVar )

{

std::cout << "{ ";

TNL::Algorithms::staticFor< size_t, 0, sizeof...( Ts ) >(

[ & ]( auto i )

{

std::cout << std::get< i >( tupleVar );

if( i < sizeof...( Ts ) - 1 )

std::cout << ", ";

} );

std::cout << " }" << std::endl;

}

struct TuplePrinter

{

constexpr TuplePrinter() = default;

template< typename Index, typename... Ts >

void

operator()( Index i, const std::tuple< Ts... >& tupleVar )

{

std::cout << std::get< i >( tupleVar );

if( i < sizeof...( Ts ) - 1 )

std::cout << ", ";

}

};

/*

* Example function printing members of std::tuple using staticFor

* and a structure with templated operator().

*/

template< typename... Ts >

void

printTupleCallableStruct( const std::tuple< Ts... >& tupleVar )

{

std::cout << "{ ";

TNL::Algorithms::staticFor< size_t, 0, sizeof...( Ts ) >( TuplePrinter(), tupleVar );

std::cout << " }" << std::endl;

}

int

main( int argc, char* argv[] )

{

// initiate std::array

std::array< int, 5 > a{ 1, 2, 3, 4, 5 };

// print out the array using template parameters for indexing

TNL::Algorithms::staticFor< int, 0, 5 >(

[ &a ]( auto i )

{

std::cout << "a[ " << i << " ] = " << std::get< i >( a ) << std::endl;

} );

// example of printing a tuple using staticFor and a lambda function

printTuple( std::make_tuple( "Hello", 3, 2.1 ) );

// example of printing a tuple using staticFor and a structure with templated operator()

printTupleCallableStruct( std::make_tuple( "Hello", 3, 2.1 ) );

}

std::array

std::make_tuple
T make_tuple(T... args)

TNL::Algorithms::staticFor
constexpr void staticFor(Func &&f, ArgTypes &&... args)
Generic loop with constant bounds and indices usable in constant expressions.
Definition staticFor.h:60

std::tuple

Output: a[ 0 ] = 1

a[ 1 ] = 2

a[ 2 ] = 3

a[ 3 ] = 4

a[ 4 ] = 5

{ Hello, 3, 2.1 }

{ Hello, 3, 2.1 }

◆ unrolledFor()

template<typename Index, Index begin, Index end, Index unrollFactor = 8, typename Func>

void TNL::Algorithms::unrolledFor ( Func && f )

constexpr

Generic for-loop with explicit unrolling.

unrolledFor performs explicit loop unrolling of short loops which can improve performance in some cases. The bounds of the for-loop must be constant (i.e. known at the compile time). Loops longer than unrollFactor are not unrolled and executed as a normal for-loop.

The unroll factor is configurable, but note that full unrolling does not make sense for very long loops. It might even trigger the compiler's limit on recursive template instantiation. Also note that the compiler will (at least partially) unroll loops with static bounds anyway.

Template Parameters

Index	is the type of the loop indices.
begin	is the left bound of the iteration range `[begin, end)`.
end	is the right bound of the iteration range `[begin, end)`.
unrollFactor	is the maximum length of loops to fully unroll via recursive template instantiation.
Func	is the type of the functor (it is usually deduced from the argument used in the function call).

Parameters

f	is the functor to be called in each iteration.

Example: #include <iostream>

#include <TNL/Containers/StaticVector.h>

#include <TNL/Algorithms/unrolledFor.h>

using namespace TNL;

using namespace TNL::Containers;

int

main( int argc, char* argv[] )

{

/****

* Create two static vectors

*/

const int Size( 3 );

StaticVector< Size, double > a, b;

a = 1.0;

b = 2.0;

double sum( 0.0 );

/****

* Compute an addition of a vector and a constant number.

*/

Algorithms::unrolledFor< int, 0, Size >(

[ & ]( int i )

{

a[ i ] = b[ i ] + 3.14;

sum += a[ i ];

} );

std::cout << "a = " << a << std::endl;

std::cout << "sum = " << sum << std::endl;

}

TNL::Containers::StaticVector
Vector with constant size.
Definition StaticVector.h:19

TNL::Algorithms::unrolledFor
constexpr void unrolledFor(Func &&f)
Generic for-loop with explicit unrolling.
Definition unrolledFor.h:77

Output: a = [ 5.14, 5.14, 5.14 ]

sum = 15.42

Namespaces

Classes

Functions

Detailed Description

Function Documentation

◆ ascendingSort()

◆ contains()

◆ containsOnlyValue()

◆ copy() [1/3]

◆ copy() [2/3]

◆ copy() [3/3]

◆ descendingSort()

◆ distributedExclusiveScan() [1/2]

◆ distributedExclusiveScan() [2/2]

◆ distributedInclusiveScan() [1/2]

◆ distributedInclusiveScan() [2/2]

◆ distributedInplaceExclusiveScan() [1/2]

◆ distributedInplaceExclusiveScan() [2/2]

◆ distributedInplaceInclusiveScan() [1/2]

◆ distributedInplaceInclusiveScan() [2/2]

◆ equal()

◆ exclusiveScan() [1/2]

◆ exclusiveScan() [2/2]

◆ fill()

◆ fillRandom()

◆ find()

◆ inclusiveScan() [1/2]

◆ inclusiveScan() [2/2]

◆ inplaceExclusiveScan() [1/2]

◆ inplaceExclusiveScan() [2/2]

◆ inplaceInclusiveScan() [1/2]

◆ inplaceInclusiveScan() [2/2]

◆ isAscending()

◆ isDescending()

◆ isSorted()

◆ parallelFor() [1/2]

◆ parallelFor() [2/2]

◆ reduce() [1/4]

◆ reduce() [2/4]

◆ reduce() [3/4]

◆ reduce() [4/4]

◆ reduceWithArgument() [1/4]

◆ reduceWithArgument() [2/4]

◆ reduceWithArgument() [3/4]

◆ reduceWithArgument() [4/4]

◆ sort() [1/2]

◆ sort() [2/2]

◆ staticFor()

◆ unrolledFor()