CUDPP  2.3
CUDA Data-Parallel Primitives Library
radixsort_kernel.cuh File Reference

CUDPP kernel-level radix sorting routines. More...

#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cuh"

RadixSort Functions

typedef unsigned int uint
 
__global__ void emptyKernel ()
 An empty kernel used to reset CTA issue hardware.
 
__global__ void flipFloats (uint *values, uint numValues)
 Does special binary arithmetic before sorting floats. More...
 
__global__ void unflipFloats (uint *values, uint numValues)
 Undoes the flips from flipFloats. More...
 
template<bool flip>
__global__ void radixSortSingleWarp (uint *keys, uint *values, uint numElements)
 Optimization for sorts of WARP_SIZE or fewer elements. More...
 
template<bool flip>
__global__ void radixSortSingleWarpKeysOnly (uint *keys, uint numElements)
 Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More...
 
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks)
 Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More...
 
template<uint startbit, bool fullBlocks, bool loop>
__global__ void findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks)
 Computes the number of keys of each radix in each block and stores the offsets. More...
 
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks)
 Reorders data in the global array. More...
 
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks)
 Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More...
 
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks)
 Reorders data in the global array. More...
 

Detailed Description

CUDPP kernel-level radix sorting routines.

radixsort_kernel.cuh