CUDPP 2.0
CUDA Data-Parallel Primitives Library
Classes
CUDPP CTA-Level API

Classes

class  ScanTraits< T, Oper, backward, exclusive, multiRow, sums, fullBlock >
 Template class containing compile-time parameters to the scan functions. More...
class  SegmentedScanTraits< T, Oper, backward, exclusivity, doShiftFlags, fullBlock, sums, sm12OrBetter >
 Template class containing compile-time parameters to the segmented scan functions. More...

Radix Sort Functions

typedef unsigned int uint
template<bool doFlip>
__device__ uint floatFlip (uint f)
 Flips bits of single-precision floating-point number (parameterized by doFlip)
template<bool doFlip>
__device__ uint floatUnflip (uint f)
 Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)
template<class T , int maxlevel>
__device__ T scanwarp (T val, volatile T *sData)
 Scans one warp quickly, optimized for 32-element warps, using shared memory.
__device__ uint4 scan4 (uint4 idata)
 Scans 4*CTA_SIZE unsigned ints in a block.
template<int ctasize>
__device__ uint4 rank4 (uint4 preds)
 Computes output position for each thread given predicate; trues come first then falses.
template<uint nbits, uint startbit>
__device__ void radixSortBlock (uint4 &key, uint4 &value)
 Sorts one block.
template<uint nbits, uint startbit>
__device__ void radixSortBlockKeysOnly (uint4 &key)
 Sorts one block. Key-only version.

Rand Functions

__device__ void swizzleShift (uint4 *f)
 Does a GLSL-style swizzle assigning f->xyzw = f->yzwx.
__device__ unsigned int leftRotate (unsigned int x, unsigned int n)
 Rotates the bits in x over by n bits.
__device__ unsigned int F (unsigned int x, unsigned int y, unsigned int z)
 The F scrambling function.
__device__ unsigned int G (unsigned int x, unsigned int y, unsigned int z)
 The G scrambling function.
__device__ unsigned int H (unsigned int x, unsigned int y, unsigned int z)
 The H scrambling function.
__device__ unsigned int I (unsigned int x, unsigned int y, unsigned int z)
 The I scrambling function.
__device__ void FF (uint4 *td, int i, uint4 *Fr, float p, unsigned int *data)
 The FF scrambling function.
__device__ void GG (uint4 *td, int i, uint4 *Gr, float p, unsigned int *data)
 The GG scrambling function.
__device__ void HH (uint4 *td, int i, uint4 *Hr, float p, unsigned int *data)
 The HH scrambling function.
__device__ void II (uint4 *td, int i, uint4 *Ir, float p, unsigned int *data)
 The II scrambling function.
__device__ void setupInput (unsigned int *input, unsigned int seed)
 Sets up the input array using information of seed, and threadIdx.

Scan Functions

template<class T , class traits >
__device__ void loadSharedChunkFromMem4 (T *s_out, T threadScan[2][4], const T *d_in, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
 Handles loading input s_data from global memory to shared memory (vec4 version)
template<class T , class traits >
__device__ void storeSharedChunkToMem4 (T *d_out, T threadScan[2][4], T *s_in, int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
 Handles storing result s_data from shared memory to global memory (vec4 version)
template<class T , class traits >
__device__ void loadSharedChunkFromMem2 (T *s_out, T threadScan[2][2], const T *d_in, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
 Handles loading input s_data from global memory to shared memory (vec4 version)
template<class T , class traits >
__device__ void storeSharedChunkToMem2 (T *d_out, T threadScan[2][2], T *s_in, int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
 Handles storing result s_data from shared memory to global memory (vec4 version)
template<class T , class traits , int maxlevel>
__device__ T warpscan (T val, volatile T *s_data)
 Scan all warps of a CTA without synchronization.
template<class T , class traits >
__device__ void scanWarps (T x, T y, T *s_data)
 Perform a full CTA scan using the warp-scan algorithm.
template<class T , class traits >
__device__ void scanCTA (T *s_data, T *d_blockSums, unsigned int blockSumIndex)
 CTA-level scan routine; scans s_data in shared memory in each thread block.
#define DISALLOW_LOADSTORE_OVERLAP   1

Segmented scan Functions

template<class T , typename traits >
__device__ void loadForSegmentedScanSharedChunkFromMem4 (T *s_odata, T threadScan0[4], T threadScan1[4], unsigned int &threadFlag, unsigned int *s_oflags, unsigned int *s_oindices, const T *d_idata, const unsigned int *d_iflags, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
 Handles loading input s_data from global memory to shared memory (vec4 version)
template<class T , class traits >
__device__ void storeForSegmentedScanSharedChunkToMem4 (T *d_odata, T threadScan0[4], T threadScan1[4], unsigned int threadFlag, T *s_idata, unsigned int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
 Handles storing result s_data from shared memory to global memory (vec4 version)
template<class T , class traits , unsigned int blockSize>
__device__ T reduceCTA (volatile T *s_data)
template<class T , class traits , bool isExclusive, unsigned int log_simd_threads>
__device__ void warpSegScan (T val, unsigned int flag, volatile T *s_data, volatile unsigned int *s_flags, T &oVal, unsigned int &oFlag, bool print=false)
template<class T , class traits >
__device__ void segmentedScanWarps (T val1, unsigned int flag1, T val2, unsigned int flag2, T *s_data, unsigned int *s_flags)
template<class T , class traits >
__device__ void segmentedScanCTA (T *s_data, unsigned int *s_flags, unsigned int *s_indices, T *d_blockSums=0, unsigned int *d_blockFlags=0, unsigned int *d_blockIndices=0)
 CTA-level segmented scan routine;.

Detailed Description

The CUDPP CTA-Level API contains functions that run on the GPU device. These are CUDA __device__ functions that are called from within other CUDA device functions (typically CUDPP Kernel-Level API functions). They are called CTA-level functions because they typically process s_data "owned" by each CTA within shared memory, and are agnostic of any other CTAs that may be running (or how many CTAs are running), other than to compute appropriate global memory addresses.


Define Documentation

#define DISALLOW_LOADSTORE_OVERLAP   1

This is used to insert syncthreads to avoid perf loss caused by 128-bit load overlap that happens on G80. This gives about a 15% boost on scans on G80.

Todo:
Parameterize this in case this perf detail changes on future GPUs.

Function Documentation

template<bool doFlip>
__device__ uint floatFlip ( uint  f)

Flips bits of single-precision floating-point number (parameterized by doFlip)

flip a float for sorting finds SIGN of fp number. if it's 1 (negative float), it flips all bits if it's 0 (positive float), it flips the sign only

Parameters:
[in]ffloating-point input (passed as unsigned int)
Returns:
uint that stores the flipped version of the input
See also:
floatUnflip
template<bool doFlip>
__device__ uint floatUnflip ( uint  f)

Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)

flip a float back (invert FloatFlip) signed was flipped from above, so: if sign is 1 (negative), it flips the sign bit back if sign is 0 (positive), it flips all bits back

Parameters:
[in]ffloating-point input (passed as unsigned int)
Returns:
uint that stores the unflipped version of the input
See also:
floatFlip
template<class T , int maxlevel>
__device__ T scanwarp ( val,
volatile T *  sData 
)

Scans one warp quickly, optimized for 32-element warps, using shared memory.

Scans each warp in parallel ("warp-scan"), one element per thread. uses 2 numElements of shared memory per thread (64 numElements per warp)

Parameters:
[in]valElements per thread to scan
[in,out]sData
Returns:
Scanned input warp
__device__ uint4 scan4 ( uint4  idata)

Scans 4*CTA_SIZE unsigned ints in a block.

scan4 scans 4*CTA_SIZE numElements in a block (4 per thread), using a warp-scan algorithm

Parameters:
[in]idata4-vector of integers to scan
Returns:
Scanned input 4-vector of integers
template<int ctasize>
__device__ uint4 rank4 ( uint4  preds)

Computes output position for each thread given predicate; trues come first then falses.

Rank is the core of the radix sort loop. Given a predicate, it computes the output position for each thread in an ordering where all True threads come first, followed by all False threads. This version handles 4 predicates per thread; hence, "rank4".

Parameters:
[in]predstrue/false values for each of the 4 elements in this thread
Returns:
Output position for each thread
Todo:
is the description of "preds" correct?
template<uint nbits, uint startbit>
__device__ void radixSortBlock ( uint4 &  key,
uint4 &  value 
)

Sorts one block.

Uses rank to sort one bit at a time: Sorts a block according to bits startbit -> nbits + startbit

Parameters:
[in,out]key
[in,out]value
Returns:
Sorted key/value block with respect to startbit and nbits
template<uint nbits, uint startbit>
__device__ void radixSortBlockKeysOnly ( uint4 &  key)

Sorts one block. Key-only version.

Uses rank to sort one bit at a time: Sorts a block according to bits startbit -> nbits + startbit

Parameters:
[in,out]key
Returns:
Sorted key-only block with respect to startbit and nbits
__device__ void swizzleShift ( uint4 *  f)

Does a GLSL-style swizzle assigning f->xyzw = f->yzwx.

It does the equvalent of f->xyzw = f->yzwx since this functionality is in shading languages but not exposed in CUDA.

Parameters:
[in]fthe uint4 data type which will have its elements shifted. Passed in as pointer.
__device__ unsigned int leftRotate ( unsigned int  x,
unsigned int  n 
)

Rotates the bits in x over by n bits.

This is the equivalent of the ROTATELEFT operation as described in the MD5 working memo. It takes the bits in x and circular shifts it over by n bits.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[in]xthe variable with the bits
[in]nthe number of bits to shift left by.
Returns:
Rotated input
__device__ unsigned int F ( unsigned int  x,
unsigned int  y,
unsigned int  z 
)

The F scrambling function.

The F function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x & y) | ((~x) & z)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters:
[in]xSee the above formula
[in]ySee the above formula
[in]zSee the above formula
Returns:
F(x, y, z)
See also:
FF()
__device__ unsigned int G ( unsigned int  x,
unsigned int  y,
unsigned int  z 
)

The G scrambling function.

The G function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x & z) | ((~z) & y)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters:
[in]xSee the above formula
[in]ySee the above formula
[in]zSee the above formula
Returns:
G(x, y, z)
See also:
GG()
__device__ unsigned int H ( unsigned int  x,
unsigned int  y,
unsigned int  z 
)

The H scrambling function.

The H function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x ^ y ^ z)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters:
[in]xSee the above formula
[in]ySee the above formula
[in]zSee the above formula
Returns:
H(x, y, z)
See also:
HH()
__device__ unsigned int I ( unsigned int  x,
unsigned int  y,
unsigned int  z 
)

The I scrambling function.

The I function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(y ^ (x | ~z))

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters:
[in]xSee the above formula
[in]ySee the above formula
[in]zSee the above formula
Returns:
I(x, y, z)
See also:
II()
__device__ void FF ( uint4 *  td,
int  i,
uint4 *  Fr,
float  p,
unsigned int *  data 
)

The FF scrambling function.

The FF function in the MD5 technical memo is a wrapper for the F scrambling function as well as performing its own rotations using LeftRotate and swizzleShift. The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Fr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[in,out]tdThe current value of the digest stored as an uint4.
[in]iThe current iteration of the algorithm. This affects the values in data.
[in]FrThe current rotation order.
[in]pThe constant 2^32.
[in]dataThe starting input to MD5. Padded from setupInput().
Returns:
FF(input)
See also:
F()
swizzleShift()
leftRotate()
setupInput()
__device__ void GG ( uint4 *  td,
int  i,
uint4 *  Gr,
float  p,
unsigned int *  data 
)

The GG scrambling function.

The GG function in the MD5 technical memo is a wrapper for the G scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Gr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[in,out]tdThe current value of the digest stored as an uint4.
[in]iThe current iteration of the algorithm. This affects the values in data.
[in]GrThe current rotation order.
[in]pThe constant 2^32.
[in]dataThe starting input to MD5. Padded from setupInput().
Returns:
GG(input)
See also:
G()
swizzleShift()
leftRotate()
setupInput()
__device__ void HH ( uint4 *  td,
int  i,
uint4 *  Hr,
float  p,
unsigned int *  data 
)

The HH scrambling function.

The HH function in the MD5 technical memo is a wrapper for the H scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Hr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[in,out]tdThe current value of the digest stored as an uint4.
[in]iThe current iteration of the algorithm. This affects the values in data.
[in]HrThe current rotation order.
[in]pThe constant 2^32.
[in]dataThe starting input to MD5. Padded from setupInput().
Returns:
HH(input)
See also:
H()
swizzleShift()
leftRotate()
setupInput()
__device__ void II ( uint4 *  td,
int  i,
uint4 *  Ir,
float  p,
unsigned int *  data 
)

The II scrambling function.

The II function in the MD5 technical memo is a wrapper for the I scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Ir, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[in,out]tdThe current value of the digest stored as an uint4.
[in]iThe current iteration of the algorithm. This affects the values in data.
[in]IrThe current rotation order.
[in]pThe constant 2^32.
[in]dataThe starting input to MD5. Padded from setupInput().
Returns:
II(input)
See also:
I()
swizzleShift()
leftRotate()
setupInput()
__device__ void setupInput ( unsigned int *  input,
unsigned int  seed 
)

Sets up the input array using information of seed, and threadIdx.

This function sets up the input array using a combination of the current thread's id and the user supplied seed.

For more information see: The MD5 Message-Digest Algorithm.

Parameters:
[out]inputThe array which will contain the initial values for all the scrambling functions.
[in]seedThe user supplied seed as an unsigned int.
See also:
FF()
GG()
HH()
II()
gen_randMD5()
template<class T , class traits >
__device__ void loadSharedChunkFromMem4 ( T *  s_out,
threadScan[2][4],
const T *  d_in,
int  numElements,
int  iDataOffset,
int &  ai,
int &  bi,
int &  aiDev,
int &  biDev 
)

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters:
[out]s_outThe output (shared) memory array
[out]threadScanIntermediate per-thread partial sums array (x2)
[in]d_inThe input (device) memory array
[in]numElementsThe number of elements in the array being scanned
[in]iDataOffsetthe offset of the input array in global memory for this thread block
[out]aiThe shared memory address for the thread's first element (returned for reuse)
[out]biThe shared memory address for the thread's second element (returned for reuse)
[out]aiDevThe device memory address for this thread's first element (returned for reuse)
[out]biDevThe device memory address for this thread's second element (returned for reuse)
template<class T , class traits >
__device__ void storeSharedChunkToMem4 ( T *  d_out,
threadScan[2][4],
T *  s_in,
int  numElements,
int  oDataOffset,
int  ai,
int  bi,
int  aiDev,
int  biDev 
)

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them to the intermediate sums computed in loadSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters:
[out]d_outThe output (device) memory array
[in]threadScanIntermediate per-thread partial sums array (x2) (contents computed in loadSharedChunkFromMem4())
[in]s_inThe input (shared) memory array
[in]numElementsThe number of elements in the array being scanned
[in]oDataOffsetthe offset of the output array in global memory for this thread block
[in]aiThe shared memory address for the thread's first element (computed in loadSharedChunkFromMem4())
[in]biThe shared memory address for the thread's second element (computed in loadSharedChunkFromMem4())
[in]aiDevThe device memory address for this thread's first element (computed in loadSharedChunkFromMem4())
[in]biDevThe device memory address for this thread's second element (computed in loadSharedChunkFromMem4())
template<class T , class traits >
__device__ void loadSharedChunkFromMem2 ( T *  s_out,
threadScan[2][2],
const T *  d_in,
int  numElements,
int  iDataOffset,
int &  ai,
int &  bi,
int &  aiDev,
int &  biDev 
)

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters:
[out]s_outThe output (shared) memory array
[out]threadScanIntermediate per-thread partial sums array (x2)
[in]d_inThe input (device) memory array
[in]numElementsThe number of elements in the array being scanned
[in]iDataOffsetthe offset of the input array in global memory for this thread block
[out]aiThe shared memory address for the thread's first element (returned for reuse)
[out]biThe shared memory address for the thread's second element (returned for reuse)
[out]aiDevThe device memory address for this thread's first element (returned for reuse)
[out]biDevThe device memory address for this thread's second element (returned for reuse)
template<class T , class traits >
__device__ void storeSharedChunkToMem2 ( T *  d_out,
threadScan[2][2],
T *  s_in,
int  numElements,
int  oDataOffset,
int  ai,
int  bi,
int  aiDev,
int  biDev 
)

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them to the intermediate sums computed in loadSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters:
[out]d_outThe output (device) memory array
[in]threadScanIntermediate per-thread partial sums array (x2) (contents computed in loadSharedChunkFromMem4())
[in]s_inThe input (shared) memory array
[in]numElementsThe number of elements in the array being scanned
[in]oDataOffsetthe offset of the output array in global memory for this thread block
[in]aiThe shared memory address for the thread's first element (computed in loadSharedChunkFromMem4())
[in]biThe shared memory address for the thread's second element (computed in loadSharedChunkFromMem4())
[in]aiDevThe device memory address for this thread's first element (computed in loadSharedChunkFromMem4())
[in]biDevThe device memory address for this thread's second element (computed in loadSharedChunkFromMem4())
template<class T , class traits , int maxlevel>
__device__ T warpscan ( val,
volatile T *  s_data 
)

Scan all warps of a CTA without synchronization.

The warp-scan algorithm breaks a block of data into warp-sized chunks, and scans the chunks independently with a warp of threads each. Because warps execute instructions in SIMD fashion, there is no need to synchronize in order to share data within a warp (only across warps). Also, in SIMD the most efficient algorithm is a step-efficient algorithm. Therefore, within each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the work-efficient tree-based algorithm described by Guy Blelloch [1990] that takes 2 * log(N) steps and is in general more complex to implement. Previous versions of CUDPP used the Blelloch algorithm. For current GPUs, the warp size is 32, so this takes five steps per warp.

Each thread is responsible for a single element of the array to be scanned. Each thread inputs a single value to the scan via val and returns its own scanned result element. The threads of each warp cooperate via the shared memory array s_data to scan WARP_SIZE elements.

Template parameter maxlevel allows this warpscan to be performed on partial warps. For example, if only the first 8 elements of each warp need to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.

The computation uses 2 * WARP_SIZE elements of shared memory per warp to enable warps to offset beyond their input data and receive the identity element without using any branch instructions.

Note:
s_data is declared volatile here to prevent the compiler from optimizing away writes to shared memory, and ensure correct intrawarp communication in the absence of __syncthreads.
Returns:
The result of the warp scan for the current thread
Parameters:
[in]valThe current threads's input to the scan
[in,out]s_dataA pointer to a temporary shared array of 2*CTA_SIZE elements used to compute the warp scans
template<class T , class traits >
__device__ void scanWarps ( x,
y,
T *  s_data 
)

Perform a full CTA scan using the warp-scan algorithm.

As described in the comment for warpscan(), the warp-scan algorithm breaks a block of data into warp-sized chunks, and scans the chunks independently with a warp of threads each. To complete the scan, each warp j then writes its last element to element j of a temporary shared array. Then a single warp exclusive-scans these "warp sums". Finally, each thread adds the result of the warp sum scan to the result of the scan from the first pass.

Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan twice.

Parameters:
xThe first input value for the current thread
yThe second input value for the current thread
s_dataTemporary shared memory space of 2*CTA_SIZE elements for performing the scan
template<class T , class traits >
__device__ void scanCTA ( T *  s_data,
T *  d_blockSums,
unsigned int  blockSumIndex 
)

CTA-level scan routine; scans s_data in shared memory in each thread block.

This function is the main CTA-level scan function. It may be called by other CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements. Each thread is responsible for one element in each half of the input array.

Note:
This code is intended to be run on a CTA of 128 threads. Other sizes are untested.
Parameters:
[in]s_dataThe array to be scanned in shared memory
[out]d_blockSumsArray of per-block sums
[in]blockSumIndexLocation in d_blockSums to which to write this block's sum
template<class T , typename traits >
__device__ void loadForSegmentedScanSharedChunkFromMem4 ( T *  s_odata,
threadScan0[4],
threadScan1[4],
unsigned int &  threadFlag,
unsigned int *  s_oflags,
unsigned int *  s_oindices,
const T *  d_idata,
const unsigned int *  d_iflags,
int  numElements,
int  iDataOffset,
int &  ai,
int &  bi,
int &  aiDev,
int &  biDev 
) [inline]

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the segmented scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters:
[out]s_odataThe output (shared) memory array
[out]threadScan0Intermediate per-thread partial sums array 1
[out]threadScan1Intermediate per-thread partial sums array 2
[out]threadFlagIntermediate array which holds 8 flags as follows Temporary register threadFlag0[4] - the flags for the first 4 elements read Temporary register threadFlag1[4] - the flags for the second 4 elements read Temporary register threadScanFlag0[4] - the inclusive OR-scan for the flags in threadFlag0[4] Temporary register threadScanFlag1[4] - the inclusive OR-scan for the flags in threadFlag1[4] We storing the 16 flags 32 bits of threadFlag Bits 0...3 contains threadFlag0[0]...threadFlag0[3] Bits 4...7 contains threadFlag1[0]...threadFlag1[3] Bits 8...11 contains threadScanFlag0[0]...threadScanFlag0[3] Bits 11...15 contains threadScanFlag1[0]...threadScanFlag1[3]
[out]s_oflagsOutput (shared) memory array of segment head flags
[out]s_oindicesOutput (shared) memory array of indices. If a flag for a position (1-based) is set then index for that position is the position, 0 otherwise.
[in]d_idataThe input (device) memory array
[in]d_iflagsThe input (device) memory array of segment head flags
[in]numElementsThe number of elements in the array being scanned
[in]iDataOffsetthe offset of the input array in global memory for this thread block
[out]aiThe shared memory address for the thread's first element (returned for reuse)
[out]biThe shared memory address for the thread's second element (returned for reuse)
[out]aiDevThe device memory address for this thread's first element (returned for reuse)
[out]biDevThe device memory address for this thread's second element (returned for reuse)
template<class T , class traits >
__device__ void storeForSegmentedScanSharedChunkToMem4 ( T *  d_odata,
threadScan0[4],
threadScan1[4],
unsigned int  threadFlag,
T *  s_idata,
unsigned int  numElements,
int  oDataOffset,
int  ai,
int  bi,
int  aiDev,
int  biDev 
) [inline]

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of 8*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them while respecting segment bouldaries, to the intermediate sums computed in loadForSegmentedScanSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters:
[out]d_odataThe output (device) memory array
[out]threadScan0Intermediate per-thread partial sums array 1 (contents computed in loadForSegmentedScanSharedChunkFromMem4())
[in]threadScan1Intermediate per-thread partial sums array 2 (contents computed in loadForSegmentedScanSharedChunkFromMem4())
[in]threadFlagVarious flags that loadForSegmentedScanSharedChunkFromMem4() needs to pass
[in]s_idataThe input (shared) memory array
[in]numElementsThe number of elements in the array being scanned
[in]oDataOffsetthe offset of the output array in global memory for this thread block
[in]aiThe shared memory address for the thread's first element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]biThe shared memory address for the thread's second element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]aiDevThe device memory address for this thread's first element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]biDevThe device memory address for this thread's second element (computed in loadForSegmentedScanSharedChunkFromMem4())
template<class T , class traits >
__device__ void segmentedScanCTA ( T *  s_data,
unsigned int *  s_flags,
unsigned int *  s_indices,
T *  d_blockSums = 0,
unsigned int *  d_blockFlags = 0,
unsigned int *  d_blockIndices = 0 
)

CTA-level segmented scan routine;.

Performs segmented scan on s_data in shared memory in each thread block with head flags in s_flags (s_tflags is a read-write copy of the head flags which are modified).

This function is the main CTA-level segmented scan function. It may be called by other CUDA __global__ or __device__ functions.

Note:
This code is intended to be run on a CTA of 128 threads. Other sizes are untested.
Parameters:
[in]s_dataArray to be scanned in shared memory
[in]s_flagsRead-only version of flags in shared memory
[in]s_indicesTemporary read-write indices array
[out]d_blockSumsArray of per-block sums
[out]d_blockFlagsArray of per-block OR-reduction of flags
[out]d_blockIndicesArray of per-block min-reduction of indices
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines