CUDPP
2.1
CUDA Data-Parallel Primitives Library
|
CUDPP CTA-level scan routines. More...
Classes | |
class | SegmentedScanTraits< T, Oper, backward, exclusivity, doShiftFlags, fullBlock, sums, sm12OrBetter > |
Template class containing compile-time parameters to the segmented scan functions. More... | |
Functions | |
Segmented scan Functions | |
template<class T , typename traits > | |
__device__ void | loadForSegmentedScanSharedChunkFromMem4 (T *s_odata, T threadScan0[4], T threadScan1[4], unsigned int &threadFlag, unsigned int *s_oflags, unsigned int *s_oindices, const T *d_idata, const unsigned int *d_iflags, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev) |
Handles loading input s_data from global memory to shared memory (vec4 version). More... | |
template<class T , class traits > | |
__device__ void | storeForSegmentedScanSharedChunkToMem4 (T *d_odata, T threadScan0[4], T threadScan1[4], unsigned int threadFlag, T *s_idata, unsigned int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev) |
Handles storing result s_data from shared memory to global memory (vec4 version). More... | |
template<class T , class traits , unsigned int blockSize> | |
__device__ T | reduceCTA (volatile T *s_data) |
template<class T , class traits , bool isExclusive, unsigned int log_simd_threads> | |
__device__ void | warpSegScan (T val, unsigned int flag, volatile T *s_data, volatile unsigned int *s_flags, T &oVal, unsigned int &oFlag, bool print=false) |
template<class T , class traits > | |
__device__ void | segmentedScanWarps (T val1, unsigned int flag1, T val2, unsigned int flag2, T *s_data, unsigned int *s_flags) |
template<class T , class traits > | |
__device__ void | segmentedScanCTA (T *s_data, unsigned int *s_flags, unsigned int *s_indices, T *d_blockSums=0, unsigned int *d_blockFlags=0, unsigned int *d_blockIndices=0) |
CTA-level segmented scan routine. More... | |
CUDPP CTA-level scan routines.
segmented_scan_cta.cu