CUDPP 1.1
|
CUDPP kernel-level radix sorting routines. More...
#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cu"
RadixSort Functions | |
typedef unsigned int | uint |
__global__ void | emptyKernel () |
An empty kernel used to reset CTA issue hardware. | |
__global__ void | flipFloats (uint *values, uint numValues) |
Does special binary arithmetic before sorting floats. | |
__global__ void | unflipFloats (uint *values, uint numValues) |
Undoes the flips from flipFloats. | |
template<bool flip> | |
__global__ void | radixSortSingleWarp (uint *keys, uint *values, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. | |
template<bool flip> | |
__global__ void | radixSortSingleWarpKeysOnly (uint *keys, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. | |
template<uint startbit, bool fullBlocks, bool loop> | |
__global__ void | findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks) |
Computes the number of keys of each radix in each block and stores the offsets. | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. |
CUDPP kernel-level radix sorting routines.