Compact Functions
template<class T , bool isBackward>
__global__ void	compactData (T d_out, size_t d_numValidElements, const unsigned int d_indices, const unsigned int d_isValid, const T *d_in, unsigned int numElements)
	Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().
RadixSort Functions
typedef unsigned int	uint
__global__ void	emptyKernel ()
	And empty kernel used to reset CTA issue hardware.
__global__ void	flipFloats (uint *values, uint numValues)
	Does special binary arithmetic before sorting floats.
__global__ void	unflipFloats (uint *values, uint numValues)
	Undoes the flips from flipFloats.
template<bool flip>
__global__ void	radixSortSingleWarp (uint keys, uint values, uint numElements)
	Optimization for sorts of WARP_SIZE or fewer elements.
template<bool flip>
__global__ void	radixSortSingleWarpKeysOnly (uint *keys, uint numElements)
	Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void	radixSortBlocks (uint4 keysOut, uint4 valuesOut, uint4 keysIn, uint4 valuesIn, uint numElements, uint totalBlocks)
	sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
template<uint startbit, bool fullBlocks, bool loop>
__global__ void	findRadixOffsets (uint2 keys, uint counters, uint *blockOffsets, uint numElements, uint totalBlocks)
	Computes the number of keys of each radix in each block stores offset.
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void	reorderData (uint outKeys, uint outValues, uint2 keys, uint2 values, uint blockOffsets, uint offsets, uint *sizes, uint numElements, uint totalBlocks)
	Reorders data in the global array.
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void	radixSortBlocksKeysOnly (uint4 keysOut, uint4 keysIn, uint numElements, uint totalBlocks)
	Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void	reorderDataKeysOnly (uint outKeys, uint2 keys, uint blockOffsets, uint offsets, uint *sizes, uint numElements, uint totalBlocks)
	Reorders data in the global array.
Rand Functions
__global__ void	gen_randMD5 (uint4 *d_out, size_t numElements, unsigned int seed)
	The main MD5 generation algorithm.
Reduce Functions
template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>
__global__ void	reduce (T odata, const T idata, unsigned int n)
	Main reduction kernel.
Scan Functions
template<class T , class traits >
__global__ void	scan4 (T d_out, const T d_in, T *d_blockSums, int numElements, unsigned int dataRowPitch, unsigned int blockSumRowPitch)
	Main scan kernel.
Segmented scan Functions
template<class T , class traits >
__global__ void	segmentedScan4 (T d_odata, const T d_idata, const unsigned int d_iflags, unsigned int numElements, T d_blockSums=0, unsigned int d_blockFlags=0, unsigned int d_blockIndices=0)
	Main segmented scan kernel.
Sparse Matrix-Vector multiply Functions
template<class T , bool isFullBlock>
__global__ void	sparseMatrixVectorFetchAndMultiply (unsigned int d_flags, T d_prod, const T d_A, const T d_x, const unsigned int *d_indx, unsigned int numNZElts)
	Fetch and multiply kernel.
__global__ void	sparseMatrixVectorSetFlags (unsigned int d_flags, const unsigned int d_rowindx, unsigned int numRows)
	Set Flags kernel.
template<class T >
__global__ void	yGather (T d_y, const T d_prod, const unsigned int *d_rowFindx, unsigned int numRows)
	Gather final y values kernel.
Tridiagonal functions
template<class T >
__global__ void	crpcrKernel (T d_a, T d_b, T d_c, T d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int iterations)
	Hybrid CR-PCR Tridiagonal linear system solver (CRPCR)
Vector Functions
CUDA kernel methods for basic operations on vectors.
template<class T >
__global__ void	vectorAddConstant (T *d_vector, T constant, int n, int baseIndex)
	Adds a constant value to all values in the input d_vector.
template<class T >
__global__ void	vectorAddUniform (T d_vector, const T d_uniforms, int numElements, int blockOffset, int baseIndex)
	Add a uniform value to each data element of an array.
template<typename T >
__global__ void	vectorAddUniform2 (T g_data, T uniforms, int n, int eltsPerBlock)
template<class T , class Oper , int elementsPerThread, bool fullBlocks>
__global__ void	vectorAddUniform4 (T d_vector, const T d_uniforms, int numElements, int vectorRowPitch, int uniformRowPitch, int blockOffset, int baseIndex)
	Add a uniform value to each data element of an array (vec4 version)
template<class T >
__global__ void	vectorAddVector (T d_vectorA, const T d_vectorB, int numElements, int baseIndex)
	Adds together two vectors.
template<class T , class Oper , bool isLastBlockFull>
__global__ void	vectorSegmentedAddUniform4 (T d_vector, const T d_uniforms, const unsigned int *d_maxIndices, unsigned int numElements, int blockOffset, int baseIndex)
	Add a uniform value to data elements of an array (vec4 version)
template<class T , class Oper , bool isLastBlockFull>
__global__ void	vectorSegmentedAddUniformToRight4 (T d_vector, const T d_uniforms, const unsigned int *d_minIndices, unsigned int numElements, int blockOffset, int baseIndex)
	Add a uniform value to data elements of an array (vec4 version)

Detailed Description

The CUDPP Kernel-Level API contains functions that run on the GPU device across a grid of Cooperative Thread Array (CTA, aka Thread Block). These kernels are declared __global__ so that they must be invoked from host (CPU) code. They generally invoke GPU __device__ routines in the CUDPP CTA-Level API. Kernel-Level API functions are used by CUDPP Application-Level functions to implement their functionality.

Function Documentation

template<class T , bool isBackward>

__global__ void compactData	(	T *	d_out,
		size_t *	d_numValidElements,
		const unsigned int *	d_indices,
		const unsigned int *	d_isValid,
		const T *	d_in,
		unsigned int	numElements
	)

Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().

Parameters:

[out]	d_out	Output array of compacted values.
[out]	d_numValidElements	The number of elements in d_in with valid flags set to 1.
[in]	d_indices	Positions where non-null elements will go in d_out.
[in]	d_isValid	Flags indicating valid (1) and invalid (0) elements. Only valid elements will be copied to d_out.
[in]	d_in	The input array
[in]	numElements	The length of the d_in in elements.

__global__ void flipFloats	(	uint *	values,
		uint	numValues
	)

Does special binary arithmetic before sorting floats.

Uses floatFlip function to flip bits.

Parameters:

[in,out]	values	Values to be manipulated
[in]	numValues	Number of values to be flipped

__global__ void unflipFloats	(	uint *	values,
		uint	numValues
	)

Undoes the flips from flipFloats.

Uses floatUnflip function to unflip bits.

Parameters:

[in,out]	values	Values to be manipulated
[in]	numValues	Number of values to be unflipped

template<bool flip>

__global__ void radixSortSingleWarp	(	uint *	keys,
		uint *	values,
		uint	numElements
	)

Optimization for sorts of WARP_SIZE or fewer elements.

Parameters:

[in,out]	keys	Keys to be sorted.
[in,out]	values	Associated values to be sorted (through keys).
[in]	numElements	Number of elements in the sort.

template<bool flip>

__global__ void radixSortSingleWarpKeysOnly	(	uint *	keys,
		uint	numElements
	)

Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.

Parameters:

[in,out]	keys	Keys to be sorted
[in]	numElements	Total number of elements to be sorted

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>

__global__ void radixSortBlocks	(	uint4 *	keysOut,
		uint4 *	valuesOut,
		uint4 *	keysIn,
		uint4 *	valuesIn,
		uint	numElements,
		uint	totalBlocks
	)

sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters:

[out]	keysOut	Output of sorted keys
[out]	valuesOut	Output of associated values
[in]	keysIn	Input of unsorted keys in GPU
[in]	valuesIn	Input of associated input values
[in]	numElements	Total number of elements to sort
[in]	totalBlocks	The number of blocks of data to sort

template<uint startbit, bool fullBlocks, bool loop>

__global__ void findRadixOffsets	(	uint2 *	keys,
		uint *	counters,
		uint *	blockOffsets,
		uint	numElements,
		uint	totalBlocks
	)

Computes the number of keys of each radix in each block stores offset.

Given an array with blocks sorted according to a 4-bit radix group, each block counts the number of keys that fall into each radix in the group, and finds the starting offset of each radix in the block. It then writes the radix counts to the counters array, and the starting offsets to the blockOffsets array.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters:

[in]	keys	Input keys
[out]	counters	Radix count for each block
[out]	blockOffsets	The offset address for each block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of blocks

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>

__global__ void reorderData	(	uint *	outKeys,
		uint *	outValues,
		uint2 *	keys,
		uint2 *	values,
		uint *	blockOffsets,
		uint *	offsets,
		uint *	sizes,
		uint	numElements,
		uint	totalBlocks
	)

Reorders data in the global array.

reorderData shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters:

[out]	outKeys	Output of sorted keys
[out]	outValues	Output of associated values
[in]	keys	Input of unsorted keys in GPU
[in]	values	Input of associated input values
[in]	blockOffsets	The offset address for each block
[in]	offsets	Address of each radix within each block
[in]	sizes	Number of elements in a block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of data blocks to process

Todo:: Args that are const below should be prototyped as const

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>

__global__ void radixSortBlocksKeysOnly	(	uint4 *	keysOut,
		uint4 *	keysIn,
		uint	numElements,
		uint	totalBlocks
	)

Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters:

[out]	keysOut	Output of sorted keys GPU main memory
[in]	keysIn	Input of unsorted keys in GPU main memory
[in]	numElements	Total number of elements to sort
[in]	totalBlocks	Total number of blocks to sort

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>

__global__ void reorderDataKeysOnly	(	uint *	outKeys,
		uint2 *	keys,
		uint *	blockOffsets,
		uint *	offsets,
		uint *	sizes,
		uint	numElements,
		uint	totalBlocks
	)

Reorders data in the global array.

reorderDataKeysOnly shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters:

[out]	outKeys	Output result of reorderDataKeysOnly()
[in]	keys	Keys to be reordered
[in]	blockOffsets	Start offset for each block
[in]	offsets	Offset of each radix within each block
[in]	sizes	Number of elements in a block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of blocks

__global__ void gen_randMD5	(	uint4 *	d_out,
		size_t	numElements,
		unsigned int	seed
	)

The main MD5 generation algorithm.

This function runs the MD5 hashing random number generator. It generates MD5 hashes, and uses the output as randomized bits. To repeatedly call this function, always call cudppRandSeed() first to set a new seed or else the output may be the same due to the deterministic nature of hashes. gen_randMD5 generates 128 random bits per thread. Therefore, the parameter d_out is expected to be an array of type uint4 with numElements indicies.

Parameters:

[out]	d_out	the output array of type uint4.
[in]	numElements	the number of elements in d_out
[in]	seed	the random seed used to vary the output

See also:: launchRandMD5Kernel()

template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>

__global__ void reduce	(	T *	odata,
		const T *	idata,
		unsigned int	n
	)

Main reduction kernel.

This reduction kernel adds multiple elements per thread sequentially, and then the threads work together to produce a block sum in shared memory. The code is optimized using warp-synchronous programming to eliminate unnecessary barrier synchronization. Performing sequential work in each thread before performing the log(N) parallel summation reduces the overall cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). (Brent's Theorem optimization)

Parameters:

[out]	odata	The output data pointer. Each block writes a single output element.
[in]	idata	The input data pointer.
[in]	n	The number of elements to be reduced.

template<class T , class traits >

__global__ void scan4	(	T *	d_out,
		const T *	d_in,
		T *	d_blockSums,
		int	numElements,
		unsigned int	dataRowPitch,
		unsigned int	blockSumRowPitch
	)

Main scan kernel.

This __global__ device function performs one level of a multiblock scan on an arbitrary-dimensioned array in d_in, returning the result in d_out (which may point to the same array). The same function may be used for single or multi-row scans. To perform a multirow scan, pass the width of each row of the input row (in elements) in dataRowPitch, and the width of the rows of d_blockSums (in elements) in blockSumRowPitch, and invoke with a thread block grid with height greater than 1.

This function peforms one level of a recursive, multiblock scan. At the app level, this function is called by cudppScan and cudppMultiScan and used in combination with vectorAddUniform4() to produce a complete scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the ScanTraits struct containing compile-time options for the scan, such as whether it is forward or backward, exclusive or inclusive, multi- or single-row, etc.

Parameters:

[out]	d_out	The output (scanned) array
[in]	d_in	The input array to be scanned
[out]	d_blockSums	The array of per-block sums
[in]	numElements	The number of elements to scan
[in]	dataRowPitch	The width of each row of d_in in elements (for multi-row scans)
[in]	blockSumRowPitch	The with of each row of d_blockSums in elements (for multi-row scans)

template<class T , class traits >

__global__ void segmentedScan4	(	T *	d_odata,
		const T *	d_idata,
		const unsigned int *	d_iflags,
		unsigned int	numElements,
		T *	d_blockSums = `0`,
		unsigned int *	d_blockFlags = `0`,
		unsigned int *	d_blockIndices = `0`
	)

Main segmented scan kernel.

This __global__ device function performs one level of a multiblock segmented scan on an one-dimensioned array in d_idata, returning the result in d_odata (which may point to the same array).

This function performs one level of a recursive, multiblock scan. At the app level, this function is called by cudppSegmentedScan and used in combination with either vectorSegmentedAddUniform4() (forward) or vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the SegmentedScanTraits struct containing compile-time options for the segmented scan, such as whether it is forward or backward, inclusive or exclusive, etc.

Parameters:

[out]	d_odata	The output (scanned) array
[in]	d_idata	The input array to be scanned
[in]	d_iflags	The input array of flags
[out]	d_blockSums	The array of per-block sums
[out]	d_blockFlags	The array of per-block OR-reduction of flags
[out]	d_blockIndices	The array of per-block min-reduction of indices
[in]	numElements	The number of elements to scan

template<class T , bool isFullBlock>

__global__ void sparseMatrixVectorFetchAndMultiply	(	unsigned int *	d_flags,
		T *	d_prod,
		const T *	d_A,
		const T *	d_x,
		const unsigned int *	d_indx,
		unsigned int	numNZElts
	)

Fetch and multiply kernel.

This __global__ device function takes an element from the vector d_A, finds its column in d_indx and multiplies the element from d_A with its corresponding (that is having the same row) element in d_x and stores the resulting product in d_prod. It also sets all the elements of d_flags to 0.

Template parameter T is the datatype of the matrix A and x.

Parameters:

[out]	d_flags	The output flags array
[out]	d_prod	The output products array
[in]	d_A	The input matrix A
[in]	d_x	The input array x
[in]	d_indx	The input array of column indices for each element in A
[in]	numNZElts	The number of non-zero elements in matrix A

__global__ void sparseMatrixVectorSetFlags	(	unsigned int *	d_flags,
		const unsigned int *	d_rowindx,
		unsigned int	numRows
	)

Set Flags kernel.

This __global__ device function takes an element from the vector d_rowindx, and sets the corresponding position in d_flags to 1

Parameters:

[out]	d_flags	The output flags array
[in]	d_rowindx	The starting index of each row in the "flattened" version of matrix A
[in]	numRows	The number of rows in matrix A

template<class T >

__global__ void yGather	(	T *	d_y,
		const T *	d_prod,
		const unsigned int *	d_rowFindx,
		unsigned int	numRows
	)

Gather final y values kernel.

This __global__ device function takes an element from the vector d_rowFindx, which for each row gives the index of the last element of that row, reads the corresponding position in d_prod and write it in d_y

Template parameter T is the datatype of the matrix A and x.

Parameters:

[out]	d_y	The output result array
[in]	d_prod	The input products array (which now contains sums for each row)
[in]	d_rowFindx	The starting index of each row in the "flattened" version of matrix A
[in]	numRows	The number of rows in matrix A

template<class T >

__global__ void crpcrKernel	(	T *	d_a,
		T *	d_b,
		T *	d_c,
		T *	d_d,
		T *	d_x,
		unsigned int	systemSizeOriginal,
		unsigned int	iterations
	)

Hybrid CR-PCR Tridiagonal linear system solver (CRPCR)

This kernel solves a tridiagonal linear system using a hybrid CR-PCR algorithm. The solver first reduces the system size using cyclic reduction, then solves the intermediate system using parallel cyclic reduction to reduce shared memory bank conflicts and algorithmic steps, and finally switches back to cyclic reduction to solve all unknowns.

Parameters:

[out]	d_x	Solution vector
[in]	d_a	Lower diagonal
[in]	d_b	Main diagonal
[in]	d_c	Upper diagonal
[in]	d_d	Right hand side
[in]	systemSizeOriginal	The size of each system
[in]	iterations	The computed number of PCR iterations

template<class T >

__global__ void vectorAddConstant	(	T *	d_vector,
		T	constant,
		int	n,
		int	baseIndex
	)

Adds a constant value to all values in the input d_vector.

Each thread adds two pairs of elements.

Todo:: Test this function -- it is currently not yet used.

Parameters:

[in,out]	d_vector	The array of elements to be modified
[in]	constant	The constant value to be added to elements of d_vector
[in]	n	The number of elements in the d_vector to be modified
[in]	baseIndex	An optional offset to the beginning of the elements in the input array to be processed

template<class T >

__global__ void vectorAddUniform	(	T *	d_vector,
		const T *	d_uniforms,
		int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to each data element of an array.

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds two pairs of values.

Parameters:

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T , class Oper , int elementsPerThread, bool fullBlocks>

__global__ void vectorAddUniform4	(	T *	d_vector,
		const T *	d_uniforms,
		int	numElements,
		int	vectorRowPitch,
		int	uniformRowPitch,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to each data element of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds the uniform value to eight values in d_vector.

Parameters:

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	numElements	The number of elements in d_vector to process
[in]	vectorRowPitch	For 2D arrays, the pitch (in elements) of the rows of d_vector.
[in]	uniformRowPitch	For 2D arrays, the pitch (in elements) of the rows of d_uniforms.
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T >

__global__ void vectorAddVector	(	T *	d_vectorA,
		const T *	d_vectorB,
		int	numElements,
		int	baseIndex
	)

Adds together two vectors.

Each thread adds two pairs of elements.

Todo:: Test this function -- it is currently not yet used.

Parameters:

[out]	d_vectorA	The left operand array and the result
[in]	d_vectorB	The right operand array
[in]	numElements	The number of elements in the vectors to be added.
[in]	baseIndex	An optional offset to the beginning of the elements in the input arrays to be processed

template<class T , class Oper , bool isLastBlockFull>

__global__ void vectorSegmentedAddUniform4	(	T *	d_vector,
		const T *	d_uniforms,
		const unsigned int *	d_maxIndices,
		unsigned int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters:

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	d_maxIndices	The array of maximum indices (one per CTA). This is index upto which the uniform would be added. If this is UINT_MAX the uniform is added to all elements of the CTA. This index is 1-based.
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T , class Oper , bool isLastBlockFull>

__global__ void vectorSegmentedAddUniformToRight4	(	T *	d_vector,
		const T *	d_uniforms,
		const unsigned int *	d_minIndices,
		unsigned int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index greater than d_minIndex. If d_minIndex for that CTA is 0 it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters:

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	d_minIndices	The array of minimum indices (one per CTA). The uniform is added to the right of this index (that is, to every index that is greater than this index). If this is 0, the uniform is added to all elements of the CTA. This index is 1-based to prevent overloading of what 0 means. In our case it means absence of a flag. But if the first element of a CTA has flag the index will also be 0. Hence we use 1-based indices so the index is 1 in the latter case.
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

Compact Functions

RadixSort Functions

Rand Functions

Reduce Functions

Scan Functions

Segmented scan Functions

Sparse Matrix-Vector multiply Functions

Tridiagonal functions

Vector Functions

Detailed Description

Function Documentation