CUDPP
2.2
CUDA Data-Parallel Primitives Library
|
CUDPP kernel-level radix sorting routines. More...
#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cuh"
RadixSort Functions | |
typedef unsigned int | uint |
__global__ void | emptyKernel () |
An empty kernel used to reset CTA issue hardware. | |
__global__ void | flipFloats (uint *values, uint numValues) |
Does special binary arithmetic before sorting floats. More... | |
__global__ void | unflipFloats (uint *values, uint numValues) |
Undoes the flips from flipFloats. More... | |
template<bool flip> | |
__global__ void | radixSortSingleWarp (uint *keys, uint *values, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. More... | |
template<bool flip> | |
__global__ void | radixSortSingleWarpKeysOnly (uint *keys, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More... | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More... | |
template<uint startbit, bool fullBlocks, bool loop> | |
__global__ void | findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks) |
Computes the number of keys of each radix in each block and stores the offsets. More... | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. More... | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More... | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. More... | |
CUDPP kernel-level radix sorting routines.
radixsort_kernel.cu