Functions
__global__ void	alignedOffsets (unsigned int numSpaces, unsigned int d_address, unsigned char *d_stringVals, unsigned char termC, unsigned int numElements, unsigned int stringSize)
	Calculate the number of spaces required for each string to align the string array. More...

__global__ void	alignString (unsigned int packedStrings, unsigned char d_stringVals, unsigned int packedAddress, unsigned int address, unsigned int numElements, unsigned int stringArrayLength, unsigned char termC)
	Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned. More...

__global__ void	createKeys (unsigned int d_keys, unsigned int packedStrings, unsigned int *packedAddress, unsigned int numElements)
	Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array. More...

__global__ void	unpackAddresses (unsigned int packedAddress, unsigned int packedAddressRef, unsigned int address, unsigned int addressRef, size_t numElements)
	Converts addresses from packed (unaligned) form to unpacked and unaligned form Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4) More...

template<class T , int depth>
__global__ void	blockWiseStringSort (T A_keys, T A_address, T *stringVals, int blockSize, int totalSize, unsigned int stringSize, unsigned char termC)
	Does an initial blockSort based on the size of our partition (limited by shared memory size) More...

template<class T , int depth>
__global__ void	simpleStringMerge (T A_keys, T A_keys_out, T A_values, T A_values_out, T *stringValues, int sizePerPartition, int size, int step, int stringSize, unsigned char termC)
	Merges two independent sets. Each CUDA block works on two partitions of data without cooperating. More...

template<class T >
__global__ void	findMultiPartitions (T A_keys, T A_address, T stringValues, int splitsPP, int numPartitions, int partitionSize, unsigned int partitionBeginA, unsigned int partitionSizesA, unsigned int partitionBeginB, unsigned int *partitionSizesB, size_t size, size_t stringSize, unsigned char termC)
	For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions. More...

template<class T , int depth>
__global__ void	stringMergeMulti (T A_keys, T A_keys_out, T A_values, T A_values_out, T stringValues, int subPartitions, int numBlocks, unsigned int partitionBeginA, unsigned int partitionSizeA, unsigned int partitionBeginB, unsigned int *partitionSizeB, int entirePartitionSize, int step, size_t size, size_t stringSize, unsigned char termC)
	Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s) More...

Compact Functions
template<class T , bool isBackward>
__global__ void	compactData (T d_out, size_t d_numValidElements, const unsigned int d_indices, const unsigned int d_isValid, const T *d_in, unsigned int numElements)
	Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray(). More...

Compress Functions
typedef unsigned int	uint

typedef unsigned char	uchar

typedef unsigned short	ushort

__global__ void	bwt_compute_final_kernel (const uchar d_bwtIn, const uint d_values, int d_bwtIndex, uchar d_bwtOut, uint numElements, uint tThreads)
	Compute final BWT. More...

template<class T , int depth>
__global__ void	stringMergeMulti (T A_keys, T A_keys_out, T A_values, T A_values_out, T stringValues, int subPartitions, int numBlocks, int partitionBeginA, int partitionSizeA, int partitionBeginB, int *partitionSizeB, int entirePartitionSize, size_t numElements)
	Multi merge. More...

template<class T >
__global__ void	findMultiPartitions (T A, int splitsPP, int numPartitions, int partitionSize, int partitionBeginA, int partitionSizesA, int partitionBeginB, int *partitionSizesB, int sizeA)
	Merges the indices for the "upper" block (right block) More...

template<class T , int depth>
__global__ void	simpleStringMerge (T A_keys, T A_keys_out, T A_values, T A_values_out, T stringValues, int sizePerPartition, int size, T stringValues2, size_t numElements)
	Simple merge. More...

template<class T , int depth>
__global__ void	blockWiseStringSort (T A_keys, T A_address, const T stringVals, T stringVals2, int blockSize, size_t numElements)
	Sorts blocks of data of size blockSize. More...

__global__ void	bwt_keys_construct_kernel (uchar4 d_bwtIn, uint d_bwtInRef, uint d_keys, uint d_values, uint *d_bwtInRef2, uint tThreads)
	Massage input to set up for merge sort. More...

__global__ void	mtf_reduction_kernel (const uchar d_mtfIn, uchar d_lists, ushort *d_list_sizes, uint nLists, uint offset, uint numElements)
	First stage in MTF (Reduction) More...

__global__ void	mtf_GLreduction_kernel (uchar d_lists, ushort d_list_sizes, uint offset, uint tThreads, uint nLists)
	Second stage in MTF (Global reduction) More...

__global__ void	mtf_GLdownsweep_kernel (uchar d_lists, ushort d_list_sizes, uint offset, uint lastLevel, uint nLists, uint tThreads)
	Third stage in MTF (Global downsweep) More...

__global__ void	mtf_localscan_lists_kernel (const uchar d_mtfIn, uchar d_mtfOut, uchar d_lists, ushort d_list_sizes, uint nLists, uint offset, uint numElements)
	Compute final MTF lists and final MTF output. More...

__global__ void	huffman_build_histogram_kernel (uint d_input, uint d_histograms, uint numElements)
	Compute 256-entry histogram. More...

__global__ void	histo_kernel (uchar d_input, uint d_histograms, uint numElements)

__global__ void	huffman_build_tree_kernel (const uchar d_input, uchar d_huffCodesPacked, uint d_huffCodeLocations, uchar d_huffCodeLengths, uint d_histograms, uint d_histogram, uint d_nCodesPacked, uint d_totalEncodedSize, uint histBlocks, uint numElements)
	Build Huffman tree/codes. More...

__global__ void	huffman_kernel_en (uchar4 d_input, uchar d_codes, uint d_code_locations, uchar d_huffCodeLengths, encoded *d_encoded, uint nCodesPacked, uint nThreads)
	Perform parallel Huffman encoding. More...

__global__ void	huffman_datapack_kernel (encoded d_encoded, uint d_encodedData, uint d_totalEncodedSize, uint d_eOffsets)
	Pack together encoded blocks. More...

ListRank Functions
typedef unsigned int	uint

typedef unsigned char	uchar

typedef unsigned short	ushort

template<typename T >
__global__ void	list_rank_kernel_soa_1 (T d_ranked_values, const T d_unranked_values, const int d_ping, int d_pong, int *d_start_indices, int step, int head, int numElts)
	Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank(). More...

template<typename T >
__global__ void	list_rank_kernel_soa_2 (T d_ranked_values, const T d_unranked_values, const int d_pong, const int d_start_indices, int head, int numElts)
	After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank(). More...

MergeSort Functions
typedef unsigned int	uint

template<class T >
__global__ void	simpleCopy (T A_keys_dev, unsigned int A_vals_dev, T A_keys_out_dev, unsigned int A_vals_out_dev, int offset, int numElementsToCopy)
	Copies unused portions of arrays in our ping-pong strategy. More...

template<class T , int depth>
__global__ void	blockWiseSort (T A_keys, unsigned int A_values, int blockSize, size_t totalSize)
	Sorts blocks of data of size blockSize. More...

template<class T , int depth>
__global__ void	simpleMerge_lower (T A_keys, unsigned int A_values, T A_keys_out, unsigned int A_values_out, int sizePerPartition, int size)
	Merges the indices for the "lower" block (left block) More...

template<class T , int depth>
__global__ void	simpleMerge_higher (T A_keys, unsigned int A_values, T A_keys_out, unsigned int A_values_out, int sizePerPartition, int size)
	Merges the indices for the "upper" block (right block) More...

template<class T >
__global__ void	findMultiPartitions (T A, int splitsPP, int numPartitions, int partitionSize, int partitionBeginA, int *partitionSizesA, int sizeA)
	Merges the indices for the "upper" block (right block) More...

template<class T , int depth>
__global__ void	mergeMulti_lower (T A_keys_out, unsigned int A_vals_out, T A_keys, unsigned int A_vals, int subPartitions, int numBlocks, int partitionBeginA, int partitionSizeA, int entirePartitionSize, int sizeA)
	Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block) More...

template<class T , int depth>
__global__ void	mergeMulti_higher (T A_keys_out, unsigned int A_vals_out, T A_keys, unsigned int A_vals, int subPartitions, int numBlocks, int partitionBeginA, int partitionSizeA, int entirePartitionSize, int sizeA)
	Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block) More...

RadixSort Functions
typedef unsigned int	uint

__global__ void	emptyKernel ()
	And empty kernel used to reset CTA issue hardware.

__global__ void	flipFloats (uint *values, uint numValues)
	Does special binary arithmetic before sorting floats. More...

__global__ void	unflipFloats (uint *values, uint numValues)
	Undoes the flips from flipFloats. More...

template<bool flip>
__global__ void	radixSortSingleWarp (uint keys, uint values, uint numElements)
	Optimization for sorts of WARP_SIZE or fewer elements. More...

template<bool flip>
__global__ void	radixSortSingleWarpKeysOnly (uint *keys, uint numElements)
	Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More...

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void	radixSortBlocks (uint4 keysOut, uint4 valuesOut, uint4 keysIn, uint4 valuesIn, uint numElements, uint totalBlocks)
	sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements More...

template<uint startbit, bool fullBlocks, bool loop>
__global__ void	findRadixOffsets (uint2 keys, uint counters, uint *blockOffsets, uint numElements, uint totalBlocks)
	Computes the number of keys of each radix in each block stores offset. More...

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void	reorderData (uint outKeys, uint outValues, uint2 keys, uint2 values, uint blockOffsets, uint offsets, uint *sizes, uint numElements, uint totalBlocks)
	Reorders data in the global array. More...

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void	radixSortBlocksKeysOnly (uint4 keysOut, uint4 keysIn, uint numElements, uint totalBlocks)
	Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More...

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void	reorderDataKeysOnly (uint outKeys, uint2 keys, uint blockOffsets, uint offsets, uint *sizes, uint numElements, uint totalBlocks)
	Reorders data in the global array. More...

Rand Functions
__global__ void	gen_randMD5 (uint4 *d_out, size_t numElements, unsigned int seed)
	The main MD5 generation algorithm. More...

Reduce Functions
template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>
__global__ void	reduce (T odata, const T idata, unsigned int n)
	Main reduction kernel. More...

Suffix Array Functions
typedef unsigned int	uint

typedef unsigned char	uchar

__global__ void	strConstruct (uchar d_str, uint d_str_value, size_t str_length)
	Construct the input array. More...

__global__ void	resultConstruct (uint *d_keys_sa, size_t str_length)
	Reconstruct the output. More...

__global__ void	sa12_keys_construct (uint d_str, uint d_keys_uint_12, uint *d_keys_srt_12, int mod_1, size_t tThreads)
	Initialize the SA12 triplets. More...

__global__ void	sa12_keys_construct_0 (uint d_str, uint d_keys_uint_12, uint *d_keys_srt_12, size_t tThreads)
	Construct SA12 for the second radix sort. More...

__global__ void	sa12_keys_construct_1 (uint d_str, uint d_keys_uint_12, uint *d_keys_srt_12, size_t tThreads)
	Construct SA12 for the third radix sort. More...

__global__ void	compute_rank (uint d_str, uint d_keys_srt_12, uint d_flag, bool result, size_t tThreads, int str_length)
	Turn on flags for sorted SA12 triplets. More...

__global__ void	new_str_construct (uint d_new_str, uint d_keys_srt_12, uint *d_rank, int mod_1, size_t tThreads)
	Construct new array for recursion. More...

__global__ void	reconstruct (uint d_keys_srt_12, uint d_isa_12, uint *d_flag, int mod_1, size_t tThreads)
	Translate SA12 from recursion. More...

__global__ void	isa12_construct (uint d_keys_srt_12, uint d_isa_12, uint *d_flag, int mod_1, size_t tThreads)
	Construct ISA12. More...

__global__ void	sa3_srt_construct (uint d_keys_srt_3, uint d_str, uint d_keys_srt_12, uint d_keys_sa, size_t tThreads1, size_t tThreads2, int str_length)
	Contruct SA3 triplets positions. More...

__global__ void	sa3_keys_construct (uint d_keys_srt_3, uint d_keys_sa, uint *d_str, size_t tThreads, int str_length)
	Construct SA3 triplets keys. More...

__global__ void	merge_akeys_construct (uint d_str, uint d_keys_srt_12, uint d_isa_12, Vector d_aKeys, size_t tThreads, int mod_1, int bound, int str_length)
	Construct SA12 keys in terms of Vector. More...

__global__ void	merge_bkeys_construct (uint d_str, uint d_keys_srt_3, uint d_isa_12, Vector d_bKeys, size_t tThreads, int mod_1, int bound, int str_length)
	Construct SA3 keys in Vector. More...

Scan Functions
template<class T , class traits >
__global__ void	scan4 (T d_out, const T d_in, T *d_blockSums, int numElements, unsigned int dataRowPitch, unsigned int blockSumRowPitch)
	Main scan kernel. More...

Segmented scan Functions
template<class T , class traits >
__global__ void	segmentedScan4 (T d_odata, const T d_idata, const unsigned int d_iflags, unsigned int numElements, T d_blockSums=0, unsigned int d_blockFlags=0, unsigned int d_blockIndices=0)
	Main segmented scan kernel. More...

Sparse Matrix-Vector multiply Functions
template<class T , bool isFullBlock>
__global__ void	sparseMatrixVectorFetchAndMultiply (unsigned int d_flags, T d_prod, const T d_A, const T d_x, const unsigned int *d_indx, unsigned int numNZElts)
	Fetch and multiply kernel. More...

__global__ void	sparseMatrixVectorSetFlags (unsigned int d_flags, const unsigned int d_rowindx, unsigned int numRows)
	Set Flags kernel. More...

template<class T >
__global__ void	yGather (T d_y, const T d_prod, const unsigned int *d_rowFindx, unsigned int numRows)
	Gather final y values kernel. More...

Tridiagonal functions
template<class T >
__global__ void	crpcrKernel (T d_a, T d_b, T d_c, T d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int iterations)
	Hybrid CR-PCR Tridiagonal linear system solver (CRPCR) More...

Vector Functions
CUDA kernel methods for basic operations on vectors.
template<class T >
__global__ void	vectorAddConstant (T *d_vector, T constant, int n, int baseIndex)
	Adds a constant value to all values in the input d_vector. More...

template<class T >
__global__ void	vectorAddUniform (T d_vector, const T d_uniforms, int numElements, int blockOffset, int baseIndex)
	Add a uniform value to each data element of an array. More...

template<typename T >
__global__ void	vectorAddUniform2 (T g_data, T uniforms, int n, int eltsPerBlock)

template<class T , class Oper , int elementsPerThread, bool fullBlocks>
__global__ void	vectorAddUniform4 (T d_vector, const T d_uniforms, int numElements, int vectorRowPitch, int uniformRowPitch, int blockOffset, int baseIndex)
	Add a uniform value to each data element of an array (vec4 version) More...

template<class T >
__global__ void	vectorAddVector (T d_vectorA, const T d_vectorB, int numElements, int baseIndex)
	Adds together two vectors. More...

template<class T , class Oper , bool isLastBlockFull>
__global__ void	vectorSegmentedAddUniform4 (T d_vector, const T d_uniforms, const unsigned int *d_maxIndices, unsigned int numElements, int blockOffset, int baseIndex)
	Add a uniform value to data elements of an array (vec4 version) More...

template<class T , class Oper , bool isLastBlockFull>
__global__ void	vectorSegmentedAddUniformToRight4 (T d_vector, const T d_uniforms, const unsigned int *d_minIndices, unsigned int numElements, int blockOffset, int baseIndex)
	Add a uniform value to data elements of an array (vec4 version) More...

Detailed Description

The CUDPP Kernel-Level API contains functions that run on the GPU device across a grid of Cooperative Thread Array (CTA, aka Thread Block). These kernels are declared global so that they must be invoked from host (CPU) code. They generally invoke GPU device routines in the CUDPP CTA-Level API. Kernel-Level API functions are used by CUDPP Application-Level functions to implement their functionality.

Function Documentation

template<class T , bool isBackward>

__global__ void compactData	(	T *	d_out,
		size_t *	d_numValidElements,
		const unsigned int *	d_indices,
		const unsigned int *	d_isValid,
		const T *	d_in,
		unsigned int	numElements
	)

Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().

Parameters

[out]	d_out	Output array of compacted values.
[out]	d_numValidElements	The number of elements in d_in with valid flags set to 1.
[in]	d_indices	Positions where non-null elements will go in d_out.
[in]	d_isValid	Flags indicating valid (1) and invalid (0) elements. Only valid elements will be copied to d_out.
[in]	d_in	The input array
[in]	numElements	The length of the d_in in elements.

__global__ void bwt_compute_final_kernel	(	const uchar *	d_bwtIn,
		const uint *	d_values,
		int *	d_bwtIndex,
		uchar *	d_bwtOut,
		uint	numElements,
		uint	tThreads
	)

Compute final BWT.

This is the final stage in the BWT. This stage computes the final values of the BWT output. It is given the indices of where each of the cyclical rotations of the initial input were sorted to. It uses these indices to figure out the last "column" of the sorted cyclical rotations which is the final BWT output.

Parameters

[in]	d_bwtIn	Input char array to perform the BWT on.
[in]	d_values	Input array that gives the indices of where each of the cyclical rotations of the intial input were sorted to.
[out]	d_bwtIndex	Output pointer to store the BWT index. The index tells us where the original string sorted to.
[out]	d_bwtOut	Output char array of the BWT.
[in]	numElements	The number of elements we are performing a BWT on.
[in]	tThreads	The total threads we have dispatched on the device.

template<class T , int depth>

__global__ void stringMergeMulti	(	T *	A_keys,
		T *	A_keys_out,
		T *	A_values,
		T *	A_values_out,
		T *	stringValues,
		int	subPartitions,
		int	numBlocks,
		int *	partitionBeginA,
		int *	partitionSizeA,
		int *	partitionBeginB,
		int *	partitionSizeB,
		int	entirePartitionSize,
		size_t	numElements
	)

Multi merge.

Parameters

[in]	A_keys	keys to be sorted
[out]	A_keys_out	keys after being sorted
[in]	A_values	associated values to keys
[out]	A_values_out	associated values after sort
[in]	stringValues	keys of each of the cyclical rotations
[in]	subPartitions	Number of blocks working on a partition (number of sub-partitions)
[in]	numBlocks
[out]	partitionBeginA	Where each partition/subpartition will begin in A
[in]	partitionSizeA	Partition sizes decided by function findMultiPartitions
[out]	partitionBeginB	Where each partition/subpartition will begin in B
[in]	partitionSizeB	Partition sizes decided by function findMultiPartitions
[in]	entirePartitionSize	The size of an entire partition (before it is split up)
[in]	numElements	Size of the enitre array

template<class T >

__global__ void findMultiPartitions	(	T *	A,
		int	splitsPP,
		int	numPartitions,
		int	partitionSize,
		int *	partitionBeginA,
		int *	partitionSizesA,
		int *	partitionBeginB,
		int *	partitionSizesB,
		int	sizeA
	)

Merges the indices for the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters

[in]	A	Global array of keys
[in]	splitsPP	Global array of values to be merged
[in]	numPartitions	number of partitions being considered
[in]	partitionSize	Size of each partition being considered
[out]	partitionBeginA	Where each partition/subpartition will begin in A
[out]	partitionSizesA	Size of each partition/subpartition in A
[out]	partitionBeginB	Where each partition/subpartition will begin in B
[out]	partitionSizesB	Size of each partition/subpartition in B
[in]	sizeA	Size of the entire array

template<class T , int depth>

__global__ void simpleStringMerge	(	T *	A_keys,
		T *	A_keys_out,
		T *	A_values,
		T *	A_values_out,
		T *	stringValues,
		int	sizePerPartition,
		int	size,
		T *	stringValues2,
		size_t	numElements
	)

Simple merge.

Parameters

[in]	A_keys	keys to be sorted
[out]	A_keys_out	keys after being sorted
[in]	A_values	associated values to keys
[out]	A_values_out	associated values after sort
[in]	stringValues	BWT string manipulated to words
[in]	sizePerPartition	Size of each partition being merged
[in]	size	Size of total Array being sorted
[in]	stringValues2	keys of each of the cyclical rotations
[in]	numElements	Number of elements being sorted

template<class T , int depth>

__global__ void blockWiseStringSort	(	T *	A_keys,
		T *	A_address,
		const T *	stringVals,
		T *	stringVals2,
		int	blockSize,
		size_t	numElements
	)

Sorts blocks of data of size blockSize.

Parameters

[in,out]	A_keys	keys to be sorted
[in,out]	A_address	associated values to keys
[in]	stringVals	BWT string manipulated to words
[in]	stringVals2	keys of each of the cyclical rotations
[in]	blockSize	Size of the chunks being sorted
[in]	numElements	Size of the enitre array

__global__ void bwt_keys_construct_kernel	(	uchar4 *	d_bwtIn,
		uint *	d_bwtInRef,
		uint *	d_keys,
		uint *	d_values,
		uint *	d_bwtInRef2,
		uint	tThreads
	)

Massage input to set up for merge sort.

Parameters

[in]	d_bwtIn	A char array of the input data stream to perform the BWT on.
[out]	d_bwtInRef	BWT string manipulated to words.
[out]	d_keys	An array of associated keys to sort by the first four chars of the cyclical rotations.
[out]	d_values	Array of values associates with the keys to sort.
[out]	d_bwtInRef2	keys of each of the cyclical rotations.
[in]	tThreads	Pointer to the plan object used for this BWT.

__global__ void mtf_reduction_kernel	(	const uchar *	d_mtfIn,
		uchar *	d_lists,
		ushort *	d_list_sizes,
		uint	nLists,
		uint	offset,
		uint	numElements
	)

First stage in MTF (Reduction)

Parameters

[in]	d_mtfIn	A char array of the input data stream to perform the MTF on.
[out]	d_lists	A pointer to the start of MTF lists.
[out]	d_list_sizes	An array storing the size of each MTF list.
[in]	nLists	Total number of MTF lists.
[in]	offset	The offset during the reduction stage. Initialized to two.
[in]	numElements	Total number of input elements MTF transform.

__global__ void mtf_GLreduction_kernel	(	uchar *	d_lists,
		ushort *	d_list_sizes,
		uint	offset,
		uint	tThreads,
		uint	nLists
	)

Second stage in MTF (Global reduction)

Parameters

[in,out]	d_lists	A pointer to the start of MTF lists.
[in,out]	d_list_sizes	An array storing the size of each MTF list.
[in]	offset	The offset during the reduction stage. Initialized to two.
[in]	tThreads	Total number of threads dispatched.
[in]	nLists	Total number of MTF lists.

__global__ void mtf_GLdownsweep_kernel	(	uchar *	d_lists,
		ushort *	d_list_sizes,
		uint	offset,
		uint	lastLevel,
		uint	nLists,
		uint	tThreads
	)

Third stage in MTF (Global downsweep)

Parameters

[in,out]	d_lists	A pointer to the start of MTF lists.
[in,out]	d_list_sizes	An array storing the size of each MTF list.
[in]	offset	The offset during the reduction stage.
[in]	lastLevel	The limit to which offset can be set to.
[in]	nLists	Total number of MTF lists.
[in]	tThreads	Total number of threads dispatched.

__global__ void mtf_localscan_lists_kernel	(	const uchar *	d_mtfIn,
		uchar *	d_mtfOut,
		uchar *	d_lists,
		ushort *	d_list_sizes,
		uint	nLists,
		uint	offset,
		uint	numElements
	)

Compute final MTF lists and final MTF output.

Parameters

[in]	d_mtfIn	A char array of the input data stream to perform the MTF on.
[out]	d_mtfOut	A char array of the output with the transformed MTF string.
[in,out]	d_lists	A pointer to the start of MTF lists.
[in]	d_list_sizes	An array storing the size of each MTF list.
[in]	nLists	Total number of MTF lists.
[in]	offset	The offset during the reduction stage.
[in]	numElements	Total number of elements to perform the MTF on.

__global__ void huffman_build_histogram_kernel	(	uint *	d_input,
		uint *	d_histograms,
		uint	numElements
	)

Compute 256-entry histogram.

Parameters

[in]	d_input	An array of words we will use to build our histogram.
[out]	d_histograms	A pointer where we store our global histograms.
[in]	numElements	The total number of elements to build our histogram from.

__global__ void huffman_build_tree_kernel	(	const uchar *	d_input,
		uchar *	d_huffCodesPacked,
		uint *	d_huffCodeLocations,
		uchar *	d_huffCodeLengths,
		uint *	d_histograms,
		uint *	d_histogram,
		uint *	d_nCodesPacked,
		uint *	d_totalEncodedSize,
		uint	histBlocks,
		uint	numElements
	)

Build Huffman tree/codes.

Parameters

[in]	d_input	An array of input elements to encode
[out]	d_huffCodesPacked	An array of huffman bit codes packed together
[out]	d_huffCodeLocations	An array which stores the starting bit locations of each Huffman bit code
[out]	d_huffCodeLengths	An array which stores the lengths of each Huffman bit code
[in]	d_histograms	An input array of histograms to combine
[out]	d_histogram	Final histogram combined
[out]	d_nCodesPacked	Number of chars it took to store all Huffman bit codes
[out]	d_totalEncodedSize	Total number of words it takes to hold the compressed data
[in]	histBlocks	Total number of histograms we will combine into one
[in]	numElements	Number of elements to compress

__global__ void huffman_kernel_en	(	uchar4 *	d_input,
		uchar *	d_codes,
		uint *	d_code_locations,
		uchar *	d_huffCodeLengths,
		encoded *	d_encoded,
		uint	nCodesPacked,
		uint	nThreads
	)

Perform parallel Huffman encoding.

Parameters

[in]	d_input	Input array to encode
[in]	d_codes	Array of packed Huffman bit codes
[in]	d_code_locations	Array of starting Huffman bit locations
[in]	d_huffCodeLengths	An array storing the bit lengths of the Huffman codes
[out]	d_encoded	An array of encoded classes which stores the size and data of encoded data
[in]	nCodesPacked	Number of chars it took to store all Huffman bit codes
[in]	nThreads	Total number of dispatched threads

__global__ void huffman_datapack_kernel	(	encoded *	d_encoded,
		uint *	d_encodedData,
		uint *	d_totalEncodedSize,
		uint *	d_eOffsets
	)

Pack together encoded blocks.

Parameters

[in]	d_encoded	An array of encoded objects with stored size and data of the encoded data.
[out]	d_encodedData	An in array to store all encoded data.
[out]	d_totalEncodedSize	Total number words of the encoded data.
[out]	d_eOffsets	Array holding the word offsets of each encoded data block.

template<typename T >

__global__ void list_rank_kernel_soa_1	(	T *	d_ranked_values,
		const T *	d_unranked_values,
		const int *	d_ping,
		int *	d_pong,
		int *	d_start_indices,
		int	step,
		int	head,
		int	numElts
	)

Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank().

Parameters

[out]	d_ranked_values	Ranked values array
[in]	d_unranked_values	Unranked values array
[in]	d_ping	Next indices array for the current kernel call
[in]	d_pong	Next indices array for the next kernel call
[in]	d_start_indices	Holds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage.
[in]	step	The number of "ranking" threads.
[in]	head	Head node index of the linked-list.
[in]	numElts	Number of nodes to rank

template<typename T >

__global__ void list_rank_kernel_soa_2	(	T *	d_ranked_values,
		const T *	d_unranked_values,
		const int *	d_pong,
		const int *	d_start_indices,
		int	head,
		int	numElts
	)

After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank().

Parameters

[out]	d_ranked_values	Ranked values array
[in]	d_unranked_values	Unranked values array
[in]	d_pong	Next indices array for the current kernel call
[in]	d_start_indices	Holds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage.
[in]	head	Head node index of the linked-list.
[in]	numElts	Number of nodes to rank

template<class T >

__global__ void simpleCopy	(	T *	A_keys_dev,
		unsigned int *	A_vals_dev,
		T *	A_keys_out_dev,
		unsigned int *	A_vals_out_dev,
		int	offset,
		int	numElementsToCopy
	)

Copies unused portions of arrays in our ping-pong strategy.

Parameters

[in]	A_keys_dev,A_vals_dev	The keys and values we will be copying
[out]	A_keys_out_dev,A_vals_out_dev	The keys and values array we will copy to
[in]	offset	The offset we are starting to copy from
[in]	numElementsToCopy	The number of elements we copy starting from the offset
[in]	A_keys_dev	The keys we will be copying
[in]	A_vals_dev	The values we will be copying
[out]	A_keys_out_dev	The destination keys array
[out]	A_vals_out_dev	The destination values array
[in]	offset	The offset we are starting to copy from
[in]	numElementsToCopy	The number of elements we copy starting from the offset

template<class T , int depth>

__global__ void blockWiseSort	(	T *	A_keys,
		unsigned int *	A_values,
		int	blockSize,
		size_t	totalSize
	)

Sorts blocks of data of size blockSize.

Parameters

[in,out]	A_keys	keys to be sorted
[in,out]	A_values	associated values to keys
[in]	blockSize	Size of the chunks being sorted
[in]	totalSize	Size of the enitre array

template<class T , int depth>

__global__ void simpleMerge_lower	(	T *	A_keys,
		unsigned int *	A_values,
		T *	A_keys_out,
		unsigned int *	A_values_out,
		int	sizePerPartition,
		int	size
	)

Merges the indices for the "lower" block (left block)

Utilizes a "ping-pong" strategy

Parameters

[in]	A_keys	Global array of keys to be merged
[in]	A_values	Global array of values to be merged
[out]	A_keys_out	Resulting array of keys merged
[out]	A_values_out	Resulting array of values merged
[in]	sizePerPartition	Size of each partition being merged
[in]	size	Size of total Array being sorted

template<class T , int depth>

__global__ void simpleMerge_higher	(	T *	A_keys,
		unsigned int *	A_values,
		T *	A_keys_out,
		unsigned int *	A_values_out,
		int	sizePerPartition,
		int	size
	)

Merges the indices for the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters

[in]	A_keys	Global array of keys to be merged
[in]	A_values	Global array of values to be merged
[out]	A_keys_out	Resulting array of keys merged
[out]	A_values_out	Resulting array of values merged
[in]	sizePerPartition	Size of each partition being merged
[in]	size	Size of total Array being sorted

template<class T >

__global__ void findMultiPartitions	(	T *	A,
		int	splitsPP,
		int	numPartitions,
		int	partitionSize,
		int *	partitionBeginA,
		int *	partitionSizesA,
		int	sizeA
	)

Merges the indices for the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters

[in]	A	Global array of keys
[in]	splitsPP	Global array of values to be merged
[in]	numPartitions	number of partitions being considered
[in]	partitionSize	Size of each partition being considered
[out]	partitionBeginA	Where each partition/subpartition will begin in A
[out]	partitionSizesA	Size of each partition/subpartition in A
[in]	sizeA	Size of the entire array

template<class T , int depth>

__global__ void mergeMulti_lower	(	T *	A_keys_out,
		unsigned int *	A_vals_out,
		T *	A_keys,
		unsigned int *	A_vals,
		int	subPartitions,
		int	numBlocks,
		int *	partitionBeginA,
		int *	partitionSizeA,
		int	entirePartitionSize,
		int	sizeA
	)

Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block)

Utilizes a "ping-pong" strategy

Parameters

[out]	A_keys_out	Resulting array of keys merged
[out]	A_vals_out	Resulting array of values merged
[in]	A_keys	Global array of keys to be merged
[in]	A_vals	Global array of values to be merged
[in]	subPartitions	Number of blocks working on a partition (number of sub-partitions)
[in]	numBlocks
[in]	partitionBeginA	Partition starting points decided by function findMultiPartitions
[in]	partitionSizeA	Partition sizes decided by function findMultiPartitions
[in]	entirePartitionSize	The size of an entire partition (before it is split up)
[in]	sizeA	The total size of our array

template<class T , int depth>

__global__ void mergeMulti_higher	(	T *	A_keys_out,
		unsigned int *	A_vals_out,
		T *	A_keys,
		unsigned int *	A_vals,
		int	subPartitions,
		int	numBlocks,
		int *	partitionBeginA,
		int *	partitionSizeA,
		int	entirePartitionSize,
		int	sizeA
	)

Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters

[out]	A_keys_out	Resulting array of keys merged
[out]	A_vals_out	Resulting array of values merged
[in]	A_keys	Global array of keys to be merged
[in]	A_vals	Global array of values to be merged
[in]	subPartitions	Number of blocks working on a partition (number of sub-partitions)
[in]	numBlocks
[in]	partitionBeginA	Partition starting points decided by function findMultiPartitions
[in]	partitionSizeA	Partition sizes decided by function findMultiPartitions
[in]	entirePartitionSize	The size of an entire partition (before it is split up)
[in]	sizeA	The total size of our array

__global__ void flipFloats	(	uint *	values,
		uint	numValues
	)

Does special binary arithmetic before sorting floats.

Uses floatFlip function to flip bits.

Parameters

[in,out]	values	Values to be manipulated
[in]	numValues	Number of values to be flipped

__global__ void unflipFloats	(	uint *	values,
		uint	numValues
	)

Undoes the flips from flipFloats.

Uses floatUnflip function to unflip bits.

Parameters

[in,out]	values	Values to be manipulated
[in]	numValues	Number of values to be unflipped

template<bool flip>

__global__ void radixSortSingleWarp	(	uint *	keys,
		uint *	values,
		uint	numElements
	)

Optimization for sorts of WARP_SIZE or fewer elements.

Parameters

[in,out]	keys	Keys to be sorted.
[in,out]	values	Associated values to be sorted (through keys).
[in]	numElements	Number of elements in the sort.

template<bool flip>

__global__ void radixSortSingleWarpKeysOnly	(	uint *	keys,
		uint	numElements
	)

Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.

Parameters

[in,out]	keys	Keys to be sorted
[in]	numElements	Total number of elements to be sorted

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>

__global__ void radixSortBlocks	(	uint4 *	keysOut,
		uint4 *	valuesOut,
		uint4 *	keysIn,
		uint4 *	valuesIn,
		uint	numElements,
		uint	totalBlocks
	)

sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters

[out]	keysOut	Output of sorted keys
[out]	valuesOut	Output of associated values
[in]	keysIn	Input of unsorted keys in GPU
[in]	valuesIn	Input of associated input values
[in]	numElements	Total number of elements to sort
[in]	totalBlocks	The number of blocks of data to sort

template<uint startbit, bool fullBlocks, bool loop>

__global__ void findRadixOffsets	(	uint2 *	keys,
		uint *	counters,
		uint *	blockOffsets,
		uint	numElements,
		uint	totalBlocks
	)

Computes the number of keys of each radix in each block stores offset.

Given an array with blocks sorted according to a 4-bit radix group, each block counts the number of keys that fall into each radix in the group, and finds the starting offset of each radix in the block. It then writes the radix counts to the counters array, and the starting offsets to the blockOffsets array.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters

[in]	keys	Input keys
[out]	counters	Radix count for each block
[out]	blockOffsets	The offset address for each block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of blocks

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>

__global__ void reorderData	(	uint *	outKeys,
		uint *	outValues,
		uint2 *	keys,
		uint2 *	values,
		uint *	blockOffsets,
		uint *	offsets,
		uint *	sizes,
		uint	numElements,
		uint	totalBlocks
	)

Reorders data in the global array.

reorderData shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters

[out]	outKeys	Output of sorted keys
[out]	outValues	Output of associated values
[in]	keys	Input of unsorted keys in GPU
[in]	values	Input of associated input values
[in]	blockOffsets	The offset address for each block
[in]	offsets	Address of each radix within each block
[in]	sizes	Number of elements in a block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of data blocks to process

Todo:: Args that are const below should be prototyped as const

template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>

__global__ void radixSortBlocksKeysOnly	(	uint4 *	keysOut,
		uint4 *	keysIn,
		uint	numElements,
		uint	totalBlocks
	)

Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters

[out]	keysOut	Output of sorted keys GPU main memory
[in]	keysIn	Input of unsorted keys in GPU main memory
[in]	numElements	Total number of elements to sort
[in]	totalBlocks	Total number of blocks to sort

template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>

__global__ void reorderDataKeysOnly	(	uint *	outKeys,
		uint2 *	keys,
		uint *	blockOffsets,
		uint *	offsets,
		uint *	sizes,
		uint	numElements,
		uint	totalBlocks
	)

Reorders data in the global array.

reorderDataKeysOnly shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.

Parameters

[out]	outKeys	Output result of reorderDataKeysOnly()
[in]	keys	Keys to be reordered
[in]	blockOffsets	Start offset for each block
[in]	offsets	Offset of each radix within each block
[in]	sizes	Number of elements in a block
[in]	numElements	Total number of elements
[in]	totalBlocks	Total number of blocks

__global__ void gen_randMD5	(	uint4 *	d_out,
		size_t	numElements,
		unsigned int	seed
	)

The main MD5 generation algorithm.

This function runs the MD5 hashing random number generator. It generates MD5 hashes, and uses the output as randomized bits. To repeatedly call this function, always call cudppRandSeed() first to set a new seed or else the output may be the same due to the deterministic nature of hashes. gen_randMD5 generates 128 random bits per thread. Therefore, the parameter d_out is expected to be an array of type uint4 with numElements indicies.

Parameters

[out]	d_out	the output array of type uint4.
[in]	numElements	the number of elements in d_out
[in]	seed	the random seed used to vary the output

See Also: launchRandMD5Kernel()

template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>

__global__ void reduce	(	T *	odata,
		const T *	idata,
		unsigned int	n
	)

Main reduction kernel.

This reduction kernel adds multiple elements per thread sequentially, and then the threads work together to produce a block sum in shared memory. The code is optimized using warp-synchronous programming to eliminate unnecessary barrier synchronization. Performing sequential work in each thread before performing the log(N) parallel summation reduces the overall cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). (Brent's Theorem optimization)

Parameters

[out]	odata	The output data pointer. Each block writes a single output element.
[in]	idata	The input data pointer.
[in]	n	The number of elements to be reduced.

__global__ void strConstruct	(	uchar *	d_str,
		uint *	d_str_value,
		size_t	str_length
	)

Construct the input array.

This is the first stage in the SA. This stage construct the input array composed of values of the input char array followed by three 0s.

Parameters

[in]	d_str	Input char array to perform the SA on.
[out]	d_str_value	Output unsigned int array prepared for SA.
[in]	str_length	The number of elements we are performing the SA on.

__global__ void resultConstruct	(	uint *	d_keys_sa,
		size_t	str_length
	)

Reconstruct the output.

This is the final stage in the SA. This stage reconstruct the output array by reducing each value by one.

Parameters

[in,out]	d_keys_sa	Final output of the suffix array which stores the positions of sorted suffixes.
[in]	str_length	Size of the array.

__global__ void sa12_keys_construct	(	uint *	d_str,
		uint *	d_keys_uint_12,
		uint *	d_keys_srt_12,
		int	mod_1,
		size_t	tThreads
	)

Initialize the SA12 triplets.

Parameters

[in]	d_str	Initial array of character values.
[out]	d_keys_uint_12	The keys of righ-most char in SA12 triplets.
[out]	d_keys_srt_12	SA12 triplets positions.
[in]	mod_1	The number of elements whose positions mod3 = 1 (SA1)
[in]	tThreads	The number of elements whose positions mod3 = 1,2 (SA12)

__global__ void sa12_keys_construct_0	(	uint *	d_str,
		uint *	d_keys_uint_12,
		uint *	d_keys_srt_12,
		size_t	tThreads
	)

Construct SA12 for the second radix sort.

Parameters

[in]	d_str	Initial array of character values.
[out]	d_keys_uint_12	The keys of second char in SA12 triplets.
[in]	d_keys_srt_12	SA12 triplets positions.
[in]	tThreads	The number of elements in SA12.

__global__ void sa12_keys_construct_1	(	uint *	d_str,
		uint *	d_keys_uint_12,
		uint *	d_keys_srt_12,
		size_t	tThreads
	)

Construct SA12 for the third radix sort.

Parameters

[in]	d_str	Initial array of character values.
[out]	d_keys_uint_12	The keys of third char in SA12 triplets.
[in]	d_keys_srt_12	SA12 triplets positions.
[in]	tThreads	The number of elements in SA12.

__global__ void compute_rank	(	uint *	d_str,
		uint *	d_keys_srt_12,
		uint *	d_flag,
		bool *	result,
		size_t	tThreads,
		int	str_length
	)

Turn on flags for sorted SA12 triplets.

Parameters

[in]	d_str	Initial array of character values.
[in]	d_keys_srt_12	SA12 triplets positions.
[out]	d_flag	Marking the sorted triplets.
[out]	result	0 if SA12 is not fully sorted.
[in]	tThreads	The number of elements in SA12.
[in]	str_length	The number of elements in original string.

__global__ void new_str_construct	(	uint *	d_new_str,
		uint *	d_keys_srt_12,
		uint *	d_rank,
		int	mod_1,
		size_t	tThreads
	)

Construct new array for recursion.

Parameters

[out]	d_new_str	The new string to be sent to recursion.
[in]	d_keys_srt_12	SA12 triplets positions.
[in]	d_rank	Ranks of SA12 from compute_rank kernel.
[in]	mod_1	The number of elements of SA1.
[in]	tThreads	The number of elements of SA12.

__global__ void reconstruct	(	uint *	d_keys_srt_12,
		uint *	d_isa_12,
		uint *	d_flag,
		int	mod_1,
		size_t	tThreads
	)

Translate SA12 from recursion.

Parameters

[in,out]	d_keys_srt_12	Sorted SA12.
[in]	d_isa_12	ISA12.
[in]	d_flag	Flags to mark SA1.
[in]	mod_1	The number of elements in SA1.
[in]	tThreads	The number of elements in SA12.

__global__ void isa12_construct	(	uint *	d_keys_srt_12,
		uint *	d_isa_12,
		uint *	d_flag,
		int	mod_1,
		size_t	tThreads
	)

Construct ISA12.

Parameters

[in]	d_keys_srt_12	Fully sorted SA12 in global position.
[out]	d_isa_12	ISA12 to store the ranks in local position.
[out]	d_flag	Flags to mark SA1.
[in]	mod_1	The number of elements in SA1.
[in]	tThreads	The number of elements in SA12.

__global__ void sa3_srt_construct	(	uint *	d_keys_srt_3,
		uint *	d_str,
		uint *	d_keys_srt_12,
		uint *	d_keys_sa,
		size_t	tThreads1,
		size_t	tThreads2,
		int	str_length
	)

Contruct SA3 triplets positions.

Parameters

[out]	d_keys_srt_3	SA3 generated from SA1.
[in]	d_str	Original input array.
[in]	d_keys_srt_12	Fully sorted SA12.
[in]	d_keys_sa	Positions of SA1.
[in]	tThreads1	The number of elements of SA12.
[in]	tThreads2	The number of elements of SA3.
[in]	str_length	The number of elements in original string.

__global__ void sa3_keys_construct	(	uint *	d_keys_srt_3,
		uint *	d_keys_sa,
		uint *	d_str,
		size_t	tThreads,
		int	str_length
	)

Construct SA3 triplets keys.

Parameters

[in]	d_keys_srt_3	SA3 triplets positions.
[out]	d_keys_sa	SA3 keys.
[in]	d_str	Original input string.
[in]	tThreads	The number of elements in SA12.
[in]	str_length	The number of elements in original string.

__global__ void merge_akeys_construct	(	uint *	d_str,
		uint *	d_keys_srt_12,
		uint *	d_isa_12,
		Vector *	d_aKeys,
		size_t	tThreads,
		int	mod_1,
		int	bound,
		int	str_length
	)

Construct SA12 keys in terms of Vector.

Parameters

[in]	d_str	Original input data stream
[in]	d_keys_srt_12	The order of aKeys.
[in]	d_isa_12	The ranks in SA12 orders.
[out]	d_aKeys	SA12 keys in Vectors.
[in]	tThreads	The number elements in SA12
[in]	mod_1	The number of elements in SA1.
[in]	bound	The number of elements in SA12 plus SA3.
[in]	str_length	The number of elements in original string.

__global__ void merge_bkeys_construct	(	uint *	d_str,
		uint *	d_keys_srt_3,
		uint *	d_isa_12,
		Vector *	d_bKeys,
		size_t	tThreads,
		int	mod_1,
		int	bound,
		int	str_length
	)

Construct SA3 keys in Vector.

Parameters

[in]	d_str	Original input data stream.
[in]	d_keys_srt_3	The order of bKeys
[in]	d_isa_12	ISA12.
[out]	d_bKeys	SA3 keys in Vectors.
[in]	tThreads	The number of total threads.
[in]	mod_1	The number of elements in SA1.
[in]	bound	The number of elements in SA12 and SA3.
[in]	str_length	The number of elements in original str.

template<class T , class traits >

__global__ void scan4	(	T *	d_out,
		const T *	d_in,
		T *	d_blockSums,
		int	numElements,
		unsigned int	dataRowPitch,
		unsigned int	blockSumRowPitch
	)

Main scan kernel.

This global device function performs one level of a multiblock scan on an arbitrary-dimensioned array in d_in, returning the result in d_out (which may point to the same array). The same function may be used for single or multi-row scans. To perform a multirow scan, pass the width of each row of the input row (in elements) in dataRowPitch, and the width of the rows of d_blockSums (in elements) in blockSumRowPitch, and invoke with a thread block grid with height greater than 1.

This function peforms one level of a recursive, multiblock scan. At the app level, this function is called by cudppScan and cudppMultiScan and used in combination with vectorAddUniform4() to produce a complete scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the ScanTraits struct containing compile-time options for the scan, such as whether it is forward or backward, exclusive or inclusive, multi- or single-row, etc.

Parameters

[out]	d_out	The output (scanned) array
[in]	d_in	The input array to be scanned
[out]	d_blockSums	The array of per-block sums
[in]	numElements	The number of elements to scan
[in]	dataRowPitch	The width of each row of d_in in elements (for multi-row scans)
[in]	blockSumRowPitch	The with of each row of d_blockSums in elements (for multi-row scans)

template<class T , class traits >

__global__ void segmentedScan4	(	T *	d_odata,
		const T *	d_idata,
		const unsigned int *	d_iflags,
		unsigned int	numElements,
		T *	d_blockSums = `0`,
		unsigned int *	d_blockFlags = `0`,
		unsigned int *	d_blockIndices = `0`
	)

Main segmented scan kernel.

This global device function performs one level of a multiblock segmented scan on an one-dimensioned array in d_idata, returning the result in d_odata (which may point to the same array).

This function performs one level of a recursive, multiblock scan. At the app level, this function is called by cudppSegmentedScan and used in combination with either vectorSegmentedAddUniform4() (forward) or vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the SegmentedScanTraits struct containing compile-time options for the segmented scan, such as whether it is forward or backward, inclusive or exclusive, etc.

Parameters

[out]	d_odata	The output (scanned) array
[in]	d_idata	The input array to be scanned
[in]	d_iflags	The input array of flags
[out]	d_blockSums	The array of per-block sums
[out]	d_blockFlags	The array of per-block OR-reduction of flags
[out]	d_blockIndices	The array of per-block min-reduction of indices
[in]	numElements	The number of elements to scan

template<class T , bool isFullBlock>

__global__ void sparseMatrixVectorFetchAndMultiply	(	unsigned int *	d_flags,
		T *	d_prod,
		const T *	d_A,
		const T *	d_x,
		const unsigned int *	d_indx,
		unsigned int	numNZElts
	)

Fetch and multiply kernel.

This global device function takes an element from the vector d_A, finds its column in d_indx and multiplies the element from d_A with its corresponding (that is having the same row) element in d_x and stores the resulting product in d_prod. It also sets all the elements of d_flags to 0.

Template parameter T is the datatype of the matrix A and x.

Parameters

[out]	d_flags	The output flags array
[out]	d_prod	The output products array
[in]	d_A	The input matrix A
[in]	d_x	The input array x
[in]	d_indx	The input array of column indices for each element in A
[in]	numNZElts	The number of non-zero elements in matrix A

__global__ void sparseMatrixVectorSetFlags	(	unsigned int *	d_flags,
		const unsigned int *	d_rowindx,
		unsigned int	numRows
	)

Set Flags kernel.

This global device function takes an element from the vector d_rowindx, and sets the corresponding position in d_flags to 1

Parameters

[out]	d_flags	The output flags array
[in]	d_rowindx	The starting index of each row in the "flattened" version of matrix A
[in]	numRows	The number of rows in matrix A

template<class T >

__global__ void yGather	(	T *	d_y,
		const T *	d_prod,
		const unsigned int *	d_rowFindx,
		unsigned int	numRows
	)

Gather final y values kernel.

This global device function takes an element from the vector d_rowFindx, which for each row gives the index of the last element of that row, reads the corresponding position in d_prod and write it in d_y

Template parameter T is the datatype of the matrix A and x.

Parameters

[out]	d_y	The output result array
[in]	d_prod	The input products array (which now contains sums for each row)
[in]	d_rowFindx	The starting index of each row in the "flattened" version of matrix A
[in]	numRows	The number of rows in matrix A

__global__ void alignedOffsets	(	unsigned int *	numSpaces,
		unsigned int *	d_address,
		unsigned char *	d_stringVals,
		unsigned char	termC,
		unsigned int	numElements,
		unsigned int	stringSize
	)

Calculate the number of spaces required for each string to align the string array.

Parameters

[out]	numSpaces	Number of spaces required for each string
[in]	d_address	Input addresses of each string
[in]	d_stringVals	String array
[in]	termC	Termination character for the strings
[in]	numElements	Number of strings
[in]	stringSize	Number of characters in the string array

__global__ void alignString	(	unsigned int *	packedStrings,
		unsigned char *	d_stringVals,
		unsigned int *	packedAddress,
		unsigned int *	address,
		unsigned int	numElements,
		unsigned int	stringArrayLength,
		unsigned char	termC
	)

Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned.

Parameters

[out]	packedStrings	Resulting packed strings.
[in]	d_stringVals	Unpacked string array which we will pack
[out]	packedAddress	Resulting addresses for each string to the packedStrings array
[in]	address	Input addresses of unpacked strings
[in]	numElements	Number of strings
[in]	stringArrayLength	Number of characters in the string array
[in]	termC	Termination character for the strings

__global__ void createKeys	(	unsigned int *	d_keys,
		unsigned int *	packedStrings,
		unsigned int *	packedAddress,
		unsigned int	numElements
	)

Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array.

Parameters

[out]	d_keys	Resulting keys
[in]	packedStrings	Packed string array.
[in]	packedAddress	Addresses which point to the string array.
[in]	numElements	Number of strings

__global__ void unpackAddresses	(	unsigned int *	packedAddress,
		unsigned int *	packedAddressRef,
		unsigned int *	address,
		unsigned int *	addressRef,
		size_t	numElements
	)

Converts addresses from packed (unaligned) form to unpacked and unaligned form Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4)

Parameters

[in]	packedAddress	Resulting packed addresses that have been sorted. All strings are aligned.
[in]	packedAddressRef	Original array after packing (before sort). Used as a reference.
[out]	address	Final output of sorted addresses in unpacked form.
[in]	addressRef	Reference array of original unpacked addresses.
[in]	numElements	Number of strings

template<class T , int depth>

__global__ void blockWiseStringSort	(	T *	A_keys,
		T *	A_address,
		T *	stringVals,
		int	blockSize,
		int	totalSize,
		unsigned int	stringSize,
		unsigned char	termC
	)

Does an initial blockSort based on the size of our partition (limited by shared memory size)

Parameters

[in,out]	A_keys,A_address	This sort is in-place. A_keys and A_address store the key (first four characters) and addresses of our strings
[in]	stringVals	Global array of strings for tie breaks
[in]	blockSize	size of each block
[in]	totalSize	The total size of the array we are sorting
[in]	stringSize	The size of our string array (stringVals)
[in]	termC	Termination character for the strings

template<class T , int depth>

__global__ void simpleStringMerge	(	T *	A_keys,
		T *	A_keys_out,
		T *	A_values,
		T *	A_values_out,
		T *	stringValues,
		int	sizePerPartition,
		int	size,
		int	step,
		int	stringSize,
		unsigned char	termC
	)

Merges two independent sets. Each CUDA block works on two partitions of data without cooperating.

Parameters

[in]	A_keys	First four characters (input) of our sets to merge
[in]	A_values	Addresses of the strings (for tie breaks)
[in]	stringValues	Global string array for tie breaks
[out]	A_keys_out,A_values_out	Keys and values array after merge step
[in]	sizePerPartition	The size of each partition for this merge step
[in]	size	Global size of our array
[in]	step	Number of merges done so far
[in]	stringSize	global string length
[in]	termC	Termination character for the strings

template<class T >

__global__ void findMultiPartitions	(	T *	A_keys,
		T *	A_address,
		T *	stringValues,
		int	splitsPP,
		int	numPartitions,
		int	partitionSize,
		unsigned int *	partitionBeginA,
		unsigned int *	partitionSizesA,
		unsigned int *	partitionBeginB,
		unsigned int *	partitionSizesB,
		size_t	size,
		size_t	stringSize,
		unsigned char	termC
	)

For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions.

Parameters

[in]	A_keys,A_address	First four characters (input), and addresses of our inputs
[in]	stringValues	Global string array for tie breaks
[in]	splitsPP,numPartitions,partitionSize	Partition information for this routine (splitsPP=splits Per Partition)
[in]	partitionBeginA,partitionSizesA	Partition starting points and sizes for each new subpartition in our original set in A
[in]	partitionBeginB,partitionSizesB	Partition starting points and sizes for each new subpartition in our original set in B
[in]	size,stringSize	Number of elements in our set, and size of our global string array
[in]	termC	Termination character for the strings

template<class T , int depth>

__global__ void stringMergeMulti	(	T *	A_keys,
		T *	A_keys_out,
		T *	A_values,
		T *	A_values_out,
		T *	stringValues,
		int	subPartitions,
		int	numBlocks,
		unsigned int *	partitionBeginA,
		unsigned int *	partitionSizeA,
		unsigned int *	partitionBeginB,
		unsigned int *	partitionSizeB,
		int	entirePartitionSize,
		int	step,
		size_t	size,
		size_t	stringSize,
		unsigned char	termC
	)

Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s)

Parameters

[in]	A_keys,A_values	First four characters (input), and addresses of our inputs
[out]	A_keys_out,A_values_out	First four characters, and addresses for our outputs(ping-pong)
[in]	stringValues	string array for tie breaks
[out]	subPartitions,numBlocks	Number of splits per partitions and number of partitions respectively
[in]	partitionBeginA,partitionSizeA	Where partitions begin and how large they are for Segment A
[in]	partitionBeginB,partitionSizeB	Where partitions begin and how large they are for Segment B
[in]	entirePartitionSize	The maximum length of a partition
[in]	step	Number of merge cycles done
[in]	size	Number of total strings being sorted
[in]	stringSize	Length of string array
[in]	termC	Termination character for the strings

template<class T >

__global__ void crpcrKernel	(	T *	d_a,
		T *	d_b,
		T *	d_c,
		T *	d_d,
		T *	d_x,
		unsigned int	systemSizeOriginal,
		unsigned int	iterations
	)

Hybrid CR-PCR Tridiagonal linear system solver (CRPCR)

This kernel solves a tridiagonal linear system using a hybrid CR-PCR algorithm. The solver first reduces the system size using cyclic reduction, then solves the intermediate system using parallel cyclic reduction to reduce shared memory bank conflicts and algorithmic steps, and finally switches back to cyclic reduction to solve all unknowns.

Parameters

[out]	d_x	Solution vector
[in]	d_a	Lower diagonal
[in]	d_b	Main diagonal
[in]	d_c	Upper diagonal
[in]	d_d	Right hand side
[in]	systemSizeOriginal	The size of each system
[in]	iterations	The computed number of PCR iterations

template<class T >

__global__ void vectorAddConstant	(	T *	d_vector,
		T	constant,
		int	n,
		int	baseIndex
	)

Adds a constant value to all values in the input d_vector.

Each thread adds two pairs of elements.

Todo:: Test this function – it is currently not yet used.

Parameters

[in,out]	d_vector	The array of elements to be modified
[in]	constant	The constant value to be added to elements of d_vector
[in]	n	The number of elements in the d_vector to be modified
[in]	baseIndex	An optional offset to the beginning of the elements in the input array to be processed

template<class T >

__global__ void vectorAddUniform	(	T *	d_vector,
		const T *	d_uniforms,
		int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to each data element of an array.

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds two pairs of values.

Parameters

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T , class Oper , int elementsPerThread, bool fullBlocks>

__global__ void vectorAddUniform4	(	T *	d_vector,
		const T *	d_uniforms,
		int	numElements,
		int	vectorRowPitch,
		int	uniformRowPitch,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to each data element of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds the uniform value to eight values in d_vector.

Parameters

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	numElements	The number of elements in d_vector to process
[in]	vectorRowPitch	For 2D arrays, the pitch (in elements) of the rows of d_vector.
[in]	uniformRowPitch	For 2D arrays, the pitch (in elements) of the rows of d_uniforms.
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T >

__global__ void vectorAddVector	(	T *	d_vectorA,
		const T *	d_vectorB,
		int	numElements,
		int	baseIndex
	)

Adds together two vectors.

Each thread adds two pairs of elements.

Todo:: Test this function – it is currently not yet used.

Parameters

[out]	d_vectorA	The left operand array and the result
[in]	d_vectorB	The right operand array
[in]	numElements	The number of elements in the vectors to be added.
[in]	baseIndex	An optional offset to the beginning of the elements in the input arrays to be processed

template<class T , class Oper , bool isLastBlockFull>

__global__ void vectorSegmentedAddUniform4	(	T *	d_vector,
		const T *	d_uniforms,
		const unsigned int *	d_maxIndices,
		unsigned int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	d_maxIndices	The array of maximum indices (one per CTA). This is index upto which the uniform would be added. If this is UINT_MAX the uniform is added to all elements of the CTA. This index is 1-based.
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

template<class T , class Oper , bool isLastBlockFull>

__global__ void vectorSegmentedAddUniformToRight4	(	T *	d_vector,
		const T *	d_uniforms,
		const unsigned int *	d_minIndices,
		unsigned int	numElements,
		int	blockOffset,
		int	baseIndex
	)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index greater than d_minIndex. If d_minIndex for that CTA is 0 it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters

[out]	d_vector	The d_vector whose values will have the uniform added
[in]	d_uniforms	The array of uniform values (one per CTA)
[in]	d_minIndices	The array of minimum indices (one per CTA). The uniform is added to the right of this index (that is, to every index that is greater than this index). If this is 0, the uniform is added to all elements of the CTA. This index is 1-based to prevent overloading of what 0 means. In our case it means absence of a flag. But if the first element of a CTA has flag the index will also be 0. Hence we use 1-based indices so the index is 1 in the latter case.
[in]	numElements	The number of elements in d_vector to process
[in]	blockOffset	an optional offset to the beginning of this block's data.
[in]	baseIndex	an optional offset to the beginning of the array within d_vector.

Functions

Compact Functions

Compress Functions

ListRank Functions

MergeSort Functions

RadixSort Functions

Rand Functions

Reduce Functions

Suffix Array Functions

Scan Functions

Segmented scan Functions

Sparse Matrix-Vector multiply Functions

Tridiagonal functions

Vector Functions

Detailed Description

Function Documentation