Template Numerical Library, version main:f17d0c8
|
Public Types | |
using | DeviceType = Devices::Cuda |
using | ValueType = Value |
Public Member Functions | |
template<typename CMP > | |
void | firstPhase (const CMP &Cmp) |
int | getElemPerBlock () const |
int | getSetsNeeded (int elemPerBlock) const |
void | init (Containers::ArrayView< Value, Devices::Cuda > arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) |
template<typename CMP > | |
int | initTasks (int elemPerBlock, const CMP &Cmp) |
template<typename CMP > | |
void | performSort (const CMP &Cmp) |
void | processNewTasks () |
template<typename CMP > | |
void | secondPhase (const CMP &Cmp) |
template<typename Array > | |
void | sort (Array &arr) |
template<typename Array , typename Compare > | |
void | sort (Array &arr, const Compare &cmp) |
Protected Attributes | |
Containers::ArrayView< Value, Devices::Cuda > | arr |
Containers::ArrayView< Value, Devices::Cuda > | aux |
Containers::Array< Value, Devices::Cuda > | auxMem |
Containers::Array< TASK, Devices::Cuda > | cuda_2ndPhaseTasks |
Containers::Array< int, Devices::Cuda > | cuda_2ndPhaseTasksAmount |
Containers::Array< int, Devices::Cuda > | cuda_blockToTaskMapping |
Containers::Array< TASK, Devices::Cuda > | cuda_newTasks |
Containers::Array< int, Devices::Cuda > | cuda_newTasksAmount |
Containers::Array< int, Devices::Cuda > | cuda_reductionTaskInitMem |
Containers::Array< TASK, Devices::Cuda > | cuda_tasks |
int | desired_2ndPhasElemPerBlock |
int | desiredElemPerBlock |
const int | g_maxTasks = 1 << 14 |
int | host_1stPhaseTasksAmount = 0 |
int | host_2ndPhaseTasksAmount = 0 |
int | iteration = 0 |
int | maxBlocks |
std::size_t | maxSharable |
int | maxTasks |
int | threadsPerBlock |
void TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::firstPhase | ( | const CMP & | Cmp | ) |
Does the 1st phase of Quicksort until it runs out of task memory or each task is small enough. For correctness, the secondPhase method then needs to be called to sort each remaining subsequence.
Initializes tasks so that each block knows which task to work on and which part of the array to split. Also sets the pivot needed for partitioning; this is why Cmp is needed.
Checks whether the partition procedure can use shared memory for coalesced writes after reordering.
Moves elements smaller than the pivot to the left and bigger ones to the right. Note: the pivot isn't inserted in the middle yet.
Fills in the gap between the smaller and bigger elements with elements equal to the pivot. After writing, it also creates new tasks; each task generates at most 2 new tasks.
Tasks smaller than desired_2ndPhasElemPerBlock go into the 2nd phase; bigger ones need more blocks to partition and are written into the new-tasks array selected by iteration % 2 — the code rotates between the 2 task arrays to avoid copying.
int TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::getElemPerBlock | ( | ) | const |
Returns the optimal number of elements per thread needed for the sort phase.
int TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::getSetsNeeded | ( | int | elemPerBlock | ) | const |
Returns how many blocks are needed to start sort phase 1 if elemPerBlock were to be used.
int TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::initTasks | ( | int | elemPerBlock, |
const CMP & | Cmp ) |
Returns the number of blocks needed to start phase 1, while also initializing all tasks.
void TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::processNewTasks | ( | ) |
Updates the necessary variables after one phase-1 sort pass.
void TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::secondPhase | ( | const CMP & | Cmp | ) |
Sorts all leftover tasks.
void TNL::Algorithms::Sorting::Quicksorter< Value, Devices::Cuda >::sort | ( | Array & | arr, |
const Compare & | cmp ) |
For every block there is a bit of shared memory reserved; the actual value can slightly differ.
The goal is to use shared memory as often as possible. Each thread in a block will process n elements, where n == multiplier.
blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable