|
CUDPP
2.2
CUDA Data-Parallel Primitives Library
|
CUDPP kernel-level radix sorting routines. More...
#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cuh"
RadixSort Functions | |
| typedef unsigned int | uint |
| __global__ void | emptyKernel () |
| An empty kernel used to reset CTA issue hardware. | |
| __global__ void | flipFloats (uint *values, uint numValues) |
| Does special binary arithmetic before sorting floats. More... | |
| __global__ void | unflipFloats (uint *values, uint numValues) |
| Undoes the flips from flipFloats. More... | |
| template<bool flip> | |
| __global__ void | radixSortSingleWarp (uint *keys, uint *values, uint numElements) |
| Optimization for sorts of WARP_SIZE or fewer elements. More... | |
| template<bool flip> | |
| __global__ void | radixSortSingleWarpKeysOnly (uint *keys, uint numElements) |
| Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More... | |
| template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
| __global__ void | radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks) |
| Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More... | |
| template<uint startbit, bool fullBlocks, bool loop> | |
| __global__ void | findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks) |
| Computes the number of keys of each radix in each block and stores the offsets. More... | |
| template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
| __global__ void | reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
| Reorders data in the global array. More... | |
| template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
| __global__ void | radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks) |
| Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More... | |
| template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
| __global__ void | reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
| Reorders data in the global array. More... | |
CUDPP kernel-level radix sorting routines.
radixsort_kernel.cu
1.8.6