CUDPP
2.2
CUDA Data-Parallel Primitives Library
CUDPP CTA-level sort routines.
#include <cudpp_globals.h>
#include "cudpp_radixsort.h"
#include "cta/scan_cta.cuh"
#include <cudpp.h>
#include <stdio.h>
#include <cudpp_util.h>
#include <math.h>
#include "sharedmem.h"
Radix Sort Functions
typedef unsigned int uint
template<bool doFlip> __device__ uint floatFlip (uint f)
Flips the bits of a single-precision floating-point number (parameterized by doFlip).
template<bool doFlip> __device__ uint floatUnflip (uint f)
Reverses the bit-flip of a single-precision floating-point number (parameterized by doFlip).
template<class T, int maxlevel> __device__ T scanwarp (T val, volatile T *sData)
Scans one warp quickly, optimized for 32-element warps, using shared memory.
__device__ uint4 scan4 (uint4 idata)
Scans 4*CTA_SIZE unsigned ints in a block.
template<int ctasize> __device__ uint4 rank4 (uint4 preds)
Computes the output position for each thread given a predicate; trues come first, then falses.
template<uint nbits, uint startbit> __device__ void radixSortBlock (uint4 &key, uint4 &value)
Sorts one block.
template<uint nbits, uint startbit> __device__ void radixSortBlockKeysOnly (uint4 &key)
Sorts one block. Key-only version.
CUDPP CTA-level sort routines.
sort_cta.cu