Modules
	Hash Table Data Structures and Constants

Classes
class	CudaHT::CuckooHashing::HashTable
	Basic hash table that stores one value for each key. More...

Compact Functions
void	calculateCompactLaunchParams (const unsigned int numElements, unsigned int &numThreads, unsigned int &numBlocks, unsigned int &numEltsPerBlock)
	Calculate launch parameters for compactArray(). More...

template<class T >
void	compactArray (T d_out, size_t d_numValidElements, const T d_in, const unsigned int d_isValid, size_t numElements, const CUDPPCompactPlan *plan)
	Compact the non-zero elements of an array. More...

void	allocCompactStorage (CUDPPCompactPlan *plan)
	Allocate intermediate arrays used by cudppCompact(). More...

void	freeCompactStorage (CUDPPCompactPlan *plan)
	Deallocate intermediate storage used by cudppCompact(). More...

void	cudppCompactDispatch (void d_out, size_t d_numValidElements, const void d_in, const unsigned int d_isValid, size_t numElements, const CUDPPCompactPlan *plan)
	Dispatch compactArray for the specified datatype. More...

Compress Functions
void	huffmanEncoding (unsigned int d_hist, unsigned int d_encodeOffset, unsigned int d_compressedSize, unsigned int d_compressed, size_t numElements, const CUDPPCompressPlan *plan)
	Perform Huffman encoding. More...

template<class T >
void	moveToFrontTransform (unsigned char d_mtfIn, unsigned char d_mtfOut, size_t numElements, const T *plan)
	Perform the Move-to-Front Transform (MTF) More...

template<class T >
void	burrowsWheelerTransform (unsigned char d_uncompressed, int d_bwtIndex, unsigned char d_bwtOut, size_t numElements, const T plan)
	Perform the Burrows-Wheeler Transform (BWT) More...

void	burrowsWheelerTransformWrapper (unsigned char d_in, int d_bwtIndex, size_t numElements, const CUDPPCompressPlan *plan)
	Wrapper for calling the Burrows-Wheeler Transform (BWT). More...

void	burrowsWheelerTransformWrapper (unsigned char d_in, int d_bwtIndex, unsigned char d_bwtOut, size_t numElements, const CUDPPBwtPlan plan)
	Wrapper for calling the Burrows-Wheeler Transform (BWT). More...

void	moveToFrontTransformWrapper (size_t numElements, const CUDPPCompressPlan *plan)
	Wrapper for calling the Move-to-Front (MTF) transform. More...

void	moveToFrontTransformWrapper (unsigned char d_in, unsigned char d_mtfOut, size_t numElements, const CUDPPMtfPlan *plan)
	Wrapper for calling the Move-to-Front (MTF) transform. More...

void	allocBwtStorage (CUDPPBwtPlan *plan)
	Allocate intermediate arrays used by BWT. More...

void	allocMtfStorage (CUDPPMtfPlan *plan)
	Allocate intermediate arrays used by MTF. More...

void	allocCompressStorage (CUDPPCompressPlan *plan)
	Allocate intermediate arrays used by compression. More...

void	freeCompressStorage (CUDPPCompressPlan *plan)
	Deallocate intermediate block arrays in a CUDPPCompressPlan object. More...

void	freeBwtStorage (CUDPPBwtPlan *plan)
	Deallocate intermediate block arrays in a CUDPPBwtPlan object. More...

void	freeMtfStorage (CUDPPMtfPlan *plan)
	Deallocate intermediate block arrays in a CUDPPMtfPlan object. More...

void	cudppCompressDispatch (void d_uncompressed, void d_bwtIndex, void d_histSize, void d_hist, void d_encodeOffset, void d_compressedSize, void d_compressed, size_t numElements, const CUDPPCompressPlan plan)
	Dispatch function to perform parallel compression on an array with the specified configuration. More...

void	cudppBwtDispatch (void d_in, void d_out, void d_index, size_t numElements, const CUDPPBwtPlan plan)
	Dispatch function to perform the Burrows-Wheeler transform. More...

void	cudppMtfDispatch (void d_in, void d_out, size_t numElements, const CUDPPMtfPlan *plan)
	Dispatch function to perform the Move-to-Front transform. More...

ListRank Functions
template<typename T >
void	listRank (T d_ranked_values, T d_unranked_values, int d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan plan)
	Launch list ranking. More...

void	allocListRankStorage (CUDPPListRankPlan *plan)
	Allocate intermediate arrays used by ListRank. More...

void	freeListRankStorage (CUDPPListRankPlan *plan)
	Deallocate intermediate block arrays in a CUDPPListRankPlan object. More...

CUDPPResult	cudppListRankDispatch (void d_ranked_values, void d_unranked_values, void d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan plan)
	Dispatch function to perform parallel list ranking on a linked-list with the specified configuration. More...

MergeSort Functions
template<typename T >
void	runMergeSort (T pkeys, unsigned int pvals, size_t numElements, const CUDPPMergeSortPlan *plan)
	Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge. More...

void	allocMergeSortStorage (CUDPPMergeSortPlan *plan)
	From the programmer-specified sort configuration, creates internal memory for performing the sort. More...

void	freeMergeSortStorage (CUDPPMergeSortPlan *plan)
	Deallocates intermediate memory from allocRadixSortStorage. More...

void	cudppMergeSortDispatch (void keys, void values, size_t numElements, const CUDPPMergeSortPlan *plan)
	Dispatch function to perform a sort on an array with a specified configuration. More...

#define	BLOCKSORT_SIZE 1024

#define	DEPTH 8

MultiSplit Functions
template<typename bucket_t , typename key_type >
void	multisplit_WMS_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context)

template<typename bucket_t , typename key_type >
void	multisplit_WMS_pairs_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context)

template<typename bucket_t , typename key_type >
void	multisplit_WMS_postscan_function (key_type d_key_in, key_type d_key_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context)

template<typename bucket_t , typename key_type , typename value_type >
void	multisplit_WMS_pairs_postscan_function (key_type d_key_in, value_type d_value_in, key_type d_key_out, value_type d_value_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context)

template<typename bucket_t , typename key_type >
void	multisplit_BMS_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context)

template<typename bucket_t , typename key_type >
void	multisplit_BMS_pairs_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context)

template<typename bucket_t , typename key_type >
void	multisplit_BMS_postscan_function (key_type d_key_in, key_type d_key_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context)

template<typename bucket_t , typename key_type , typename value_type >
void	multisplit_BMS_pairs_postscan_function (key_type d_key_in, value_type d_value_in, key_type d_key_out, value_type d_value_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context)

void	multisplit_allocate_key_only (size_t num_elements, uint32_t num_buckets, multisplit_context &context)

void	multisplit_allocate_key_value (size_t num_elements, uint32_t num_buckets, multisplit_context &context)

void	multisplit_release_memory (multisplit_context &context)

template<typename key_type , typename bucket_t >
void	multisplit_key_only (key_type d_key_in, key_type d_key_out, size_t num_elements, uint32_t num_buckets, multisplit_context &context, bucket_t bucket_identifier, bool in_place, uint32_t *bucket_offsets=NULL)
	Performs multisplit on keys only. More...

template<typename key_type , typename value_type , typename bucket_t >
void	multisplit_key_value (key_type d_key_in, value_type d_value_in, key_type d_key_out, value_type d_value_out, size_t num_elements, uint32_t num_buckets, multisplit_context &context, bucket_t bucket_identifier, bool in_place, uint32_t *bucket_offsets=NULL)
	Performs multisplit on key-value pairs. More...

cub::CachingDeviceAllocator	g_allocator (true)

template<class T >
void	reducedBitSortKeysOnly (unsigned int d_inp, uint numElements, uint numBuckets, T bucketMapper, const CUDPPMultiSplitPlan plan)
	Performs multisplit on keys only using the reduced-bit sort method. More...

template<class T >
void	reducedBitSortKeyValue (unsigned int d_keys, unsigned int d_values, unsigned int numElements, unsigned int numBuckets, T bucketMapper, const CUDPPMultiSplitPlan *plan)
	Performs multisplit on key-value pairs using a reduced-bit sort. More...

void	allocMultiSplitStorage (CUDPPMultiSplitPlan *plan)
	From the programmer-specified multisplit configuration, creates internal memory for performing the multisplit. Different storage amounts are required depending on the number of buckets. More...

void	freeMultiSplitStorage (CUDPPMultiSplitPlan *plan)
	Deallocates intermediate memory from allocMultiSplitStorage. More...

void	cudppMultiSplitDispatch (unsigned int d_keys, unsigned int d_values, size_t numElements, size_t numBuckets, BucketMappingFunc bucketMappingFunc, const CUDPPMultiSplitPlan *plan)
	Dispatch function to perform multisplit on an array of elements into a number of buckets. More...

#define	MULTISPLIT_WMS_K_ONE_ROLL 8

#define	MULTISPLIT_WMS_K_TWO_ROLL 8

#define	MULTISPLIT_WMS_K_THREE_ROLL 4

#define	MULTISPLIT_WMS_K_FOUR_ROLL 4

#define	MULTISPLIT_WMS_K_FIVE_ROLL 2

#define	MULTISPLIT_WMS_KV_ONE_ROLL 4

#define	MULTISPLIT_WMS_KV_TWO_ROLL 4

#define	MULTISPLIT_WMS_KV_THREE_ROLL 2

#define	MULTISPLIT_WMS_KV_FOUR_ROLL 2

#define	MULTISPLIT_WMS_KV_FIVE_ROLL 2

#define	MULTISPLIT_BMS_K_ONE_ROLL 8

#define	MULTISPLIT_BMS_K_TWO_ROLL 8

#define	MULTISPLIT_BMS_K_THREE_ROLL 4

#define	MULTISPLIT_BMS_K_FOUR_ROLL 4

#define	MULTISPLIT_BMS_K_FIVE_ROLL 4

#define	MULTISPLIT_BMS_KV_ONE_ROLL 4

#define	MULTISPLIT_BMS_KV_TWO_ROLL 4

#define	MULTISPLIT_BMS_KV_THREE_ROLL 2

#define	MULTISPLIT_BMS_KV_FOUR_ROLL 2

#define	MULTISPLIT_BMS_KV_FIVE_ROLL 2

#define	MULTISPLIT_SWITCH_STRATEGY_K 8

#define	MULTISPLIT_SWITCH_STRATEGY_KV 8

#define	MULTISPLIT_NUM_WARPS 8

#define	MULTISPLIT_LOG_WARPS 3

#define	MULTISPLIT_WARP_WIDTH 32

#define	MULTISPLIT_TRHEADS_PER_BLOCK (MULTISPLIT_WARP_WIDTH * MULTISPLIT_NUM_WARPS)

RadixSort Functions
void	allocRadixSortStorage (CUDPPRadixSortPlan *plan)
	From the programmer-specified sort configuration, creates internal memory for performing the sort. More...

void	freeRadixSortStorage (CUDPPRadixSortPlan *plan)
	Deallocates intermediate memory from allocRadixSortStorage. More...

template<typename T >
void	runSort (T pkeys, unsigned int pvals, size_t numElements, const CUDPPRadixSortPlan *plan)

void	cudppRadixSortDispatch (void keys, void values, size_t numElements, const CUDPPRadixSortPlan *plan)
	Dispatch function to perform a sort on an array with a specified configuration. More...

Suffix Array Functions
void	KeyValueSort (unsigned int num_elements, unsigned int d_keys, unsigned int d_values)
	Radix Sort kernel from NVlab cub library. More...

void	ComputeSA (unsigned int d_str, unsigned int d_keys_sa, size_t str_length, mgpu::CudaContext &context, CUDPPSaPlan *plan, unsigned int offset, unsigned int stage)
	Perform Suffix Array (SA) using skew algorithm. More...

void	allocSaStorage (CUDPPSaPlan *plan)
	Allocate intermediate arrays used by suffix array. More...

void	freeSaStorage (CUDPPSaPlan *plan)
	Deallocate intermediate block arrays in a CUDPPSaPlan object. More...

void	cudppSuffixArrayDispatch (void d_str, unsigned int d_keys_sa, size_t d_str_length, CUDPPSaPlan *plan)
	Dispatch function to perform parallel suffix array on a string with the specified configuration. More...

Scan Functions
template<class T , bool isBackward, bool isExclusive, class Op >
void	scanArrayRecursive (T d_out, const T d_in, T *d_blockSums, size_t numElements, size_t numRows, const size_t rowPitches, int level)
	Perform recursive scan on arbitrary size arrays. More...

void	allocScanStorage (CUDPPScanPlan *plan)
	Allocate intermediate arrays used by scan. More...

void	freeScanStorage (CUDPPScanPlan *plan)
	Deallocate intermediate block sums arrays in a CUDPPScanPlan object. More...

template<typename T , bool isBackward, bool isExclusive>
void	cudppScanDispatchOperator (void d_out, const void d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)

template<bool isBackward, bool isExclusive>
void	cudppScanDispatchType (void d_out, const void d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)

void	cudppScanDispatch (void d_out, const void d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)
	Dispatch function to perform a scan (prefix sum) on an array with the specified configuration. More...

StringSort Functions
void	dotAdd (unsigned int d_address, unsigned int numSpaces, unsigned int *packedAddress, size_t numElements, size_t stringArrayLength)

void	calculateAlignedOffsets (unsigned int d_address, unsigned int numSpaces, unsigned char *d_stringVals, unsigned char termC, size_t numElements, size_t stringArrayLength)

void	packStrings (unsigned int packedStrings, unsigned char d_stringVals, unsigned int d_keys, unsigned int packedAddress, unsigned int *address, size_t numElements, size_t stringArrayLength, unsigned char termC)

void	unpackStrings (unsigned int packedAddress, unsigned int packedAddressRef, unsigned int address, unsigned int addressRef, size_t numElements)

void	runStringSort (unsigned int pkeys, unsigned int pvals, unsigned int stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan plan)
	Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings. More...

void	allocStringSortStorage (CUDPPStringSortPlan *plan)
	From the programmer-specified sort configuration, creates internal memory for performing the sort. More...

void	freeStringSortStorage (CUDPPStringSortPlan *plan)
	Deallocates intermediate memory from allocStringSortStorage. More...

void	cudppStringSortDispatch (unsigned int keys, unsigned int values, unsigned int stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan plan)
	Dispatch function to perform a sort on an array with a specified configuration. More...

#define	BLOCKSORT_SIZE 1024

#define	DEPTH 8

Tridiagonal functions
template<typename T >
unsigned int	crpcrSharedSize (unsigned int systemSizeOriginal)

template<typename T >
void	crpcr (T d_a, T d_b, T d_c, T d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int numSystems)
	Hybrid CR-PCR solver (CRPCR) More...

CUDPPResult	cudppTridiagonalDispatch (void d_a, void d_b, void d_c, void d_d, void d_x, int systemSize, int numSystems, const CUDPPTridiagonalPlan plan)
	Dispatches the tridiagonal function based on the plan. More...

Detailed Description

The CUDPP Application-Level API contains functions that run on the host CPU and invoke GPU routines in the CUDPP Kernel-Level API. Application-Level API functions are used by CUDPP Public Interface functions to implement CUDPP's core functionality.

Function Documentation

void calculateCompactLaunchParams	(	const unsigned int	numElements,
		unsigned int &	numThreads,
		unsigned int &	numBlocks,
		unsigned int &	numEltsPerBlock
	)

Calculate launch parameters for compactArray().

Calculates the block size and number of blocks from the total number of elements and the maximum threads per block. Called by compactArray().

The calculation is pretty straightforward - the number of blocks is calculated by dividing the number of input elements by the product of the number of threads in each CTA and the number of elements each thread will process. numThreads and numEltsPerBlock are also simple to calculate. Please note that in cases where numElements is not an exact multiple of SCAN_ELTS_PER_THREAD * CTA_SIZE we would have threads which do nothing or have a thread which will process less than SCAN_ELTS_PER_THREAD elements.

Parameters

[in]	numElements	Number of elements to sort
[out]	numThreads	Number of threads in each block
[out]	numBlocks	Number of blocks
[out]	numEltsPerBlock	Number of elements processed per block

template<class T >

void compactArray	(	T *	d_out,
		size_t *	d_numValidElements,
		const T *	d_in,
		const unsigned int *	d_isValid,
		size_t	numElements,
		const CUDPPCompactPlan *	plan
	)

Compact the non-zero elements of an array.

Given an input array d_in, compactArray() outputs a compacted version which does not have null (zero) elements. Also ouputs the number of non-zero elements in the compacted array. Called by cudppCompactDispatch().

The algorithm is straightforward, involving two steps (most of the complexity is hidden in scan, invoked with cudppScanDispatch() ).

scanArray() performs a prefix sum on d_isValid to compute output indices.
compactData() takes d_in and an intermediate array of output indices as input and writes the values with valid flags in d_isValid into d_out using the output indices.

Parameters

[out]	d_out	Array of compacted non-null elements
[out]	d_numValidElements	Pointer to unsigned int to store number of non-null elements
[in]	d_in	Input array
[out]	d_isValid	Array of flags, 1 for each non-null element, 0 for each null element. Same length as d_in
[in]	numElements	Number of elements in input array
[in]	plan	Pointer to the plan object used for this compact

void allocCompactStorage ( CUDPPCompactPlan * plan )

Allocate intermediate arrays used by cudppCompact().

In addition to the internal CUDPPScanPlan contained in CUDPPCompactPlan, CUDPPCompact also needs a temporary device array of output indices, which is allocated by this function.

Parameters

plan	Pointer to CUDPPCompactPlan object within which intermediate storage is allocated.

void freeCompactStorage ( CUDPPCompactPlan * plan )

Deallocate intermediate storage used by cudppCompact().

Deallocates the output indices array allocated by allocCompactStorage().

Parameters

plan	Pointer to CUDPPCompactPlan object initialized by allocCompactStorage().

void cudppCompactDispatch	(	void *	d_out,
		size_t *	d_numValidElements,
		const void *	d_in,
		const unsigned int *	d_isValid,
		size_t	numElements,
		const CUDPPCompactPlan *	plan
	)

Dispatch compactArray for the specified datatype.

A thin wrapper on top of compactArray which calls compactArray() for the data type specified in config. This is the app-level interface to compact used by cudppCompact().

Parameters

[out]	d_out	Compacted array of non-zero elements
[out]	d_numValidElements	Pointer to an unsigned int to store the number of non-zero elements
[in]	d_in	Input array
[in]	d_isValid	Array of boolean valid flags with same length as d_in
[in]	numElements	Number of elements to compact
[in]	plan	Pointer to plan object for this compact

void huffmanEncoding	(	unsigned int *	d_hist,
		unsigned int *	d_encodeOffset,
		unsigned int *	d_compressedSize,
		unsigned int *	d_compressed,
		size_t	numElements,
		const CUDPPCompressPlan *	plan
	)

Perform Huffman encoding.

Performs Huffman encoding on the input data stream. The input data stream is the output data stream from the previous stage (MTF) in our compress stream.

The input is given by the output of the Move-to-Front transform (MTF). There are a few things that need to be store along with the compressed data. We also store the word offset of the compressed data stream because our data is compressed into indepedent blocks (word granularity) so that they can be encoded and decoded in parallel. The number of independent blocks is HUFF_THREADS_PER_BLOCK*HUFF_WORK_PER_THREAD.

Parameters

[out]	d_hist	Histogram array of the input data stream used for decoding.
[out]	d_encodeOffset	An array of the word offsets of the independent compressed data blocks.
[out]	d_compressedSize	Pointer to the total size in words of all compressed data blocks combined.
[out]	d_compressed	A pointer to the compressed data blocks.
[in]	numElements	Total number of input elements to compress.
[in]	plan	Pointer to the plan object used for this compress.

template<class T >

void moveToFrontTransform	(	unsigned char *	d_mtfIn,
		unsigned char *	d_mtfOut,
		size_t	numElements,
		const T *	plan
	)

Perform the Move-to-Front Transform (MTF)

Performs a Move-to-Front (MTF) transform on the input data stream. The MTF transform is the second stage in our compress pipeline. The MTF manipulates the input data stream to improve the performance of entropy encoding.

Parameters

[in]	d_mtfIn	An array of the input data stream to perform the MTF transform on.
[out]	d_mtfOut	An array to store the output of the MTF transform.
[in]	numElements	Total number of input elements of the MTF transform.
[in]	plan	Pointer to the plan object used for this MTF transform.

template<class T >

void burrowsWheelerTransform	(	unsigned char *	d_uncompressed,
		int *	d_bwtIndex,
		unsigned char *	d_bwtOut,
		size_t	numElements,
		const T *	plan
	)

Perform the Burrows-Wheeler Transform (BWT)

Performs the Burrows-Wheeler Transform (BWT) on a given character string. The BWT is an algorithm which is commonly used in compression applications, mainly bzip2. The BWT orders the characters in such a way that the output tends to have many long runs of repeated characters. This bodes well for later stages in compression pipelines which perform better with repeated characters.

Parameters

[in]	d_uncompressed	A char array of the input data stream to perform the BWT on.
[out]	d_bwtIndex	The index at which the original string in the BWT sorts to.
[out]	d_bwtOut	An array to store the output of the BWT.
[in]	numElements	Total number of input elements of the BWT.
[in]	plan	Pointer to the plan object used for this BWT.

void burrowsWheelerTransformWrapper	(	unsigned char *	d_in,
		int *	d_bwtIndex,
		size_t	numElements,
		const CUDPPCompressPlan *	plan
	)

Wrapper for calling the Burrows-Wheeler Transform (BWT).

This is a wrapper function for calling the BWT. This wrapper is used internally via the compress application to call burrowsWheelerTransform().

Parameters

[in]	d_in	A char array of the input data stream to perform the BWT on.
[out]	d_bwtIndex	The index at which the original string in the BWT sorts to.
[in]	numElements	Total number of input elements to the compress stream.
[in]	plan	Pointer to the plan object used for this compress.

void burrowsWheelerTransformWrapper	(	unsigned char *	d_in,
		int *	d_bwtIndex,
		unsigned char *	d_bwtOut,
		size_t	numElements,
		const CUDPPBwtPlan *	plan
	)

Wrapper for calling the Burrows-Wheeler Transform (BWT).

This is a wrapper function for calling the BWT. This wrapper is used internally via the BWT primitive to call burrowsWheelerTransform().

Parameters

[in]	d_in	A char array of the input data stream to perform the BWT on.
[out]	d_bwtIndex	The index at which the original string in the BWT sorts to.
[out]	d_bwtOut	An array to store the output of the BWT.
[in]	numElements	Total number of input elements to the BWT.
[in]	plan	Pointer to the plan object used for this BWT.

void moveToFrontTransformWrapper	(	size_t	numElements,
		const CUDPPCompressPlan *	plan
	)

Wrapper for calling the Move-to-Front (MTF) transform.

This is a wrapper function for calling the MTF. This wrapper is used internally via the compress application to call moveToFrontTransform().

Parameters

[in]	numElements	Total number of input elements to the MTF transform.
[in]	plan	Pointer to the plan object used for this compress.

void moveToFrontTransformWrapper	(	unsigned char *	d_in,
		unsigned char *	d_mtfOut,
		size_t	numElements,
		const CUDPPMtfPlan *	plan
	)

Wrapper for calling the Move-to-Front (MTF) transform.

This is a wrapper function for calling the MTF. This wrapper is used internally via the MTF primitive to call moveToFrontTransform().

Parameters

[in]	d_in	An input char array to perform the MTF on.
[in]	d_mtfOut	An output char array to store the MTF transformed stream.
[in]	numElements	Total number of input elements to the MTF transform.
[in]	plan	Pointer to the plan object used for this MTF.

void allocBwtStorage ( CUDPPBwtPlan * plan )

Allocate intermediate arrays used by BWT.

Parameters

[in,out] plan Pointer to CUDPPBwtPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void allocMtfStorage ( CUDPPMtfPlan * plan )

Allocate intermediate arrays used by MTF.

Parameters

[in,out] plan Pointer to CUDPPMtfPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void allocCompressStorage ( CUDPPCompressPlan * plan )

Allocate intermediate arrays used by compression.

Parameters

[in,out] plan Pointer to CUDPPCompressPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void freeCompressStorage ( CUDPPCompressPlan * plan )

Deallocate intermediate block arrays in a CUDPPCompressPlan object.

Parameters

[in,out] plan Pointer to CUDPPCompressPlan object initialized by allocCompressStorage().

void freeBwtStorage ( CUDPPBwtPlan * plan )

Deallocate intermediate block arrays in a CUDPPBwtPlan object.

Parameters

[in,out] plan Pointer to CUDPPBwtPlan object initialized by allocBwtStorage().

void freeMtfStorage ( CUDPPMtfPlan * plan )

Deallocate intermediate block arrays in a CUDPPMtfPlan object.

Parameters

[in,out] plan Pointer to CUDPPMtfPlan object initialized by allocMtfStorage().

void cudppCompressDispatch	(	void *	d_uncompressed,
		void *	d_bwtIndex,
		void *	d_histSize,
		void *	d_hist,
		void *	d_encodeOffset,
		void *	d_compressedSize,
		void *	d_compressed,
		size_t	numElements,
		const CUDPPCompressPlan *	plan
	)

Dispatch function to perform parallel compression on an array with the specified configuration.

Parameters

[in]	d_uncompressed	Uncompressed data
[out]	d_bwtIndex	BWT Index
[out]	d_histSize	Histogram size
[out]	d_hist	Histogram
[out]	d_encodeOffset	Encoded offset table
[out]	d_compressedSize	Size of compressed data
[out]	d_compressed	Compressed data
[in]	numElements	Number of elements to compress
[in]	plan	Pointer to CUDPPCompressPlan object containing compress options and intermediate storage

void cudppBwtDispatch	(	void *	d_in,
		void *	d_out,
		void *	d_index,
		size_t	numElements,
		const CUDPPBwtPlan *	plan
	)

Dispatch function to perform the Burrows-Wheeler transform.

Parameters

[in]	d_in	Input data
[out]	d_out	Transformed data
[out]	d_index	BWT Index
[in]	numElements	Number of elements to compress
[in]	plan	Pointer to CUDPPBwtPlan object containing compress options and intermediate storage

void cudppMtfDispatch	(	void *	d_in,
		void *	d_out,
		size_t	numElements,
		const CUDPPMtfPlan *	plan
	)

Dispatch function to perform the Move-to-Front transform.

Parameters

[in]	d_in	Input data
[out]	d_out	Transformed data
[in]	numElements	Number of elements to compress
[in]	plan	Pointer to CUDPPMtfPlan object containing compress options and intermediate storage

template<typename T >

void listRank	(	T *	d_ranked_values,
		T *	d_unranked_values,
		int *	d_next_indices,
		size_t	head,
		size_t	numElements,
		const CUDPPListRankPlan *	plan
	)

Launch list ranking.

Given two inputs arrays, d_unranked_values and d_next_indices, listRank() outputs a ranked version of the unranked values by traversing the next indices. The head index is head. Called by cudppListRankDispatch().

Parameters

[out]	d_ranked_values	Ranked values array
[in]	d_unranked_values	Unranked values array
[in]	d_next_indices	Next indices array
[in]	head	Head pointer index
[in]	numElements	Number of nodes values to rank
[in]	plan	Pointer to CUDPPListRankPlan object containing list ranking options and intermediate storage

void allocListRankStorage ( CUDPPListRankPlan * plan )

Allocate intermediate arrays used by ListRank.

CUDPPListRank needs temporary device arrays to store next indices so that it does not overwrite the original array.

Parameters

[in,out] plan Pointer to CUDPPListRankPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void freeListRankStorage ( CUDPPListRankPlan * plan )

Deallocate intermediate block arrays in a CUDPPListRankPlan object.

Deallocates the output indices array allocated by allocListRankStorage().

Parameters

[in,out] plan Pointer to CUDPPListRankPlan object initialized by allocListRankStorage().

CUDPPResult cudppListRankDispatch	(	void *	d_ranked_values,
		void *	d_unranked_values,
		void *	d_next_indices,
		size_t	head,
		size_t	numElements,
		const CUDPPListRankPlan *	plan
	)

Dispatch function to perform parallel list ranking on a linked-list with the specified configuration.

A wrapper on top of listRank which calls listRank() for the data type specified in config. This is the app-level interface to list ranking used by cudppListRank().

Parameters

[out]	d_ranked_values	Ranked values array
[in]	d_unranked_values	Unranked values array
[in]	d_next_indices	Next indices array
[in]	head	Head pointer index
[in]	numElements	Number of nodes values to rank
[in]	plan	Pointer to CUDPPListRankPlan object containing list ranking options and intermediate storage

Returns: CUDPPResult indicating success or error condition

template<typename T >

void runMergeSort	(	T *	pkeys,
		unsigned int *	pvals,
		size_t	numElements,
		const CUDPPMergeSortPlan *	plan
	)

Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge.

Parameters

[in,out]	pkeys	Keys to be sorted.
[in,out]	pvals	Associated values to be sorted
[in]	numElements	Number of elements in the sort.
[in]	plan	Configuration information for mergesort.

void allocMergeSortStorage ( CUDPPMergeSortPlan * plan )

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters

[in] plan Pointer to CUDPPMergeSortPlan object

void freeMergeSortStorage ( CUDPPMergeSortPlan * plan )

Deallocates intermediate memory from allocRadixSortStorage.

Parameters

[in] plan Pointer to CUDPPMergeSortPlan object

void cudppMergeSortDispatch	(	void *	keys,
		void *	values,
		size_t	numElements,
		const CUDPPMergeSortPlan *	plan
	)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls mergeSort...() with appropriate template parameters and arguments as specified by the plan. Currently only sorts keys of type int, unsigned int, and float.

Parameters

[in,out]	keys	Keys to be sorted.
[in,out]	values	Associated values to be sorted (through keys).
[in]	numElements	Number of elements in the sort.
[in]	plan	Configuration information for mergeSort.

template<typename key_type , typename bucket_t >

void multisplit_key_only	(	key_type *	d_key_in,
		key_type *	d_key_out,
		size_t	num_elements,
		uint32_t	num_buckets,
		multisplit_context &	context,
		bucket_t	bucket_identifier,
		bool	in_place,
		uint32_t *	bucket_offsets = `NULL`
	)

Performs multisplit on keys only.

This function performs multisplit on a list of keys for a number of buckets less than or equal to 32. If the number of buckets is less than a threshold, a warp-level multisplit is used. If the number of buckets is greater than the threshold, a block-level multisplit is used. This function also supports copying the results of multisplit back into the input array. In addition, the offset indices marking the locations of the buckets in the result array can optionally be saved.

Parameters

[in,out]	d_key_in	Input keys to be multisplit.
[out]	d_key_out	Output keys after multisplit.
[in]	num_elements	Number of elements.
[in]	num_buckets	Number of buckets.
[in]	context	Intermediate data storage for multisplit.
[in]	bucket_identifier	Functor to map an element to a bucket number.
[in]	in_place	Flag to indicate if results are copied back to the input.
[out]	bucket_offsets	Optional output list of bucket indices.

template<typename key_type , typename value_type , typename bucket_t >

void multisplit_key_value	(	key_type *	d_key_in,
		value_type *	d_value_in,
		key_type *	d_key_out,
		value_type *	d_value_out,
		size_t	num_elements,
		uint32_t	num_buckets,
		multisplit_context &	context,
		bucket_t	bucket_identifier,
		bool	in_place,
		uint32_t *	bucket_offsets = `NULL`
	)

Performs multisplit on key-value pairs.

This function performs multisplit on a list of keys and a list of values for a number of buckets less than or equal to 32. If the number of buckets is less than a threshold, a warp-level multisplit is used. If the number of buckets is greater than the threshold, a block-level multisplit is used. This function also supports copying the results of multisplit back into the key and value input arrays. In addition, the offset indices marking the locations of the buckets in the result arrays can optionally be saved.

Parameters

[in,out]	d_key_in	Input keys to be multisplit.
[in,out]	d_value_in	Input values to be multisplit along with keys.
[out]	d_key_out	Output keys after multisplit.
[out]	d_value_out	Output keys after multisplit.
[in]	num_elements	Number of elements.
[in]	num_buckets	Number of buckets.
[in]	context	Intermediate data storage for multisplit.
[in]	bucket_identifier	Functor to map an element to a bucket number.
[in]	in_place	Flag to indicate if results are copied back to inputs.
[out]	bucket_offsets	Optional output list of bucket indices.

template<class T >

void reducedBitSortKeysOnly	(	unsigned int *	d_inp,
		uint	numElements,
		uint	numBuckets,
		T	bucketMapper,
		const CUDPPMultiSplitPlan *	plan
	)

Performs multisplit on keys only using the reduced-bit sort method.

This function uses radix sort to perform a multisplit. It is suitable when the number of buckets is large.

Parameters

[in,out]	d_inp	Keys to be multisplit.
[in]	numElements	Number of elements.
[in]	numBuckets	Number of buckets.
[in]	bucketMapper	Functor that maps an element to a bucket number.
[in]	plan	Configuration plan for multisplit.

template<class T >

void reducedBitSortKeyValue	(	unsigned int *	d_keys,
		unsigned int *	d_values,
		unsigned int	numElements,
		unsigned int	numBuckets,
		T	bucketMapper,
		const CUDPPMultiSplitPlan *	plan
	)

Performs multisplit on key-value pairs using a reduced-bit sort.

This function uses radix sort to perform a multisplit on a list of keys and a list of values. It is suitable when the number of buckets is large.

Parameters

[in,out]	d_keys	Keys to be multisplit.
[in,out]	d_values	Associated values to be multisplit
[in]	numElements	Number of key-value pairs.
[in]	numBuckets	Number of buckets.
[in]	bucketMapper	Functor that maps an element to a bucket number.
[in]	plan	Configuration information for multisplit.

void allocMultiSplitStorage ( CUDPPMultiSplitPlan * plan )

From the programmer-specified multisplit configuration, creates internal memory for performing the multisplit. Different storage amounts are required depending on the number of buckets.

Parameters

[in] plan Pointer to CUDPPMultiSplitPlan object

void freeMultiSplitStorage ( CUDPPMultiSplitPlan * plan )

Deallocates intermediate memory from allocMultiSplitStorage.

Parameters

[in] plan Pointer to CUDPPMultiSplitPlan object

void cudppMultiSplitDispatch	(	unsigned int *	d_keys,
		unsigned int *	d_values,
		size_t	numElements,
		size_t	numBuckets,
		BucketMappingFunc	bucketMappingFunc,
		const CUDPPMultiSplitPlan *	plan
	)

Dispatch function to perform multisplit on an array of elements into a number of buckets.

This is the dispatch routine which calls multiSplit...() with appropriate parameters, including the bucket mapping function specified by plan's configuration.

Currently only splits unsigned integers.

Parameters

[in,out]	keys	Keys to be split.
[in,out]	values	Optional associated values to be split (through keys), can be NULL.
[in]	numElements	Number of elements to be split.
[in]	plan	Configuration information for multiSplit.

void allocRadixSortStorage ( CUDPPRadixSortPlan * plan )

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters

[in] plan Pointer to CUDPPRadixSortPlan object

void freeRadixSortStorage ( CUDPPRadixSortPlan * plan )

Deallocates intermediate memory from allocRadixSortStorage.

Parameters

[in] plan Pointer to CUDPPRadixSortPlan object

void cudppRadixSortDispatch	(	void *	keys,
		void *	values,
		size_t	numElements,
		const CUDPPRadixSortPlan *	plan
	)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls radixSort...() with appropriate template parameters and arguments as specified by the plan.

Parameters

[in,out]	keys	Keys to be sorted.
[in,out]	values	Associated values to be sorted (through keys).
[in]	numElements	Number of elements in the sort.
[in]	plan	Configuration information for RadixSort.

void KeyValueSort	(	unsigned int	num_elements,
		unsigned int *	d_keys,
		unsigned int *	d_values
	)

Radix Sort kernel from NVlab cub library.

Parameters

[in]	num_elements	Number of elements to sort.
[in]	d_keys	Key values of the elements in the array to be sorted.
[in,out]	d_values	Positions of the elements in the array.

void ComputeSA	(	unsigned int *	d_str,
		unsigned int *	d_keys_sa,
		size_t	str_length,
		mgpu::CudaContext &	context,
		CUDPPSaPlan *	plan,
		unsigned int	offset,
		unsigned int	stage
	)

Perform Suffix Array (SA) using skew algorithm.

Performs recursive skew kernel on a given character string. A suffix array is a sorted array of all suffixes of a string. Skew algorithm is a linear-time algorithm based on divde and conquer. The SA of a string can be used as an index to quickly locate every occurrence of a substring pattern within the string. Suffix sorting algorithms can be used to compute the Burrows-Wheeler Transform(BWT). The BWT requires sorting of all cyclic permutations of a string, thus can be computed in linear time by using a suffix array of the string.

Parameters

[in]	d_str	An unsigned int array of the input data stream to perform the SA on.
[out]	d_keys_sa	An array to store the output of the SA.
[in]	str_length	Total number of input elements.
[in]	context	Context format required by mgpu functions.
[in,out]	plan	Pointer to the plan object used for this suffix array.
[in,out]	offset	Offset to move head pointer to find the memory for each iteration.
[in,out]	stage	Stage for each iteration.

void allocSaStorage ( CUDPPSaPlan * plan )

Allocate intermediate arrays used by suffix array.

Parameters

[in,out] plan Pointer to CUDPPSaPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void freeSaStorage ( CUDPPSaPlan * plan )

Deallocate intermediate block arrays in a CUDPPSaPlan object.

Parameters

[in,out] plan Pointer to CUDPPSaPlan object initialized by allocSaStorage().

void cudppSuffixArrayDispatch	(	void *	d_str,
		unsigned int *	d_keys_sa,
		size_t	d_str_length,
		CUDPPSaPlan *	plan
	)

Dispatch function to perform parallel suffix array on a string with the specified configuration.

Parameters

[in]	d_str	input string with three $
[out]	d_keys_sa	lexicographically sorted suffix position array
[in]	d_str_length	Number of elements in the string including $
[in]	plan	Pointer to CUDPPSaPlan object containing suffix_array options and intermediate storage

template<class T , bool isBackward, bool isExclusive, class Op >

void scanArrayRecursive	(	T *	d_out,
		const T *	d_in,
		T **	d_blockSums,
		size_t	numElements,
		size_t	numRows,
		const size_t *	rowPitches,
		int	level
	)

Perform recursive scan on arbitrary size arrays.

This is the CPU-side workhorse function of the scan engine. This function invokes the CUDA kernels which perform the scan on individual blocks.

Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level has more than one block, it calls itself recursively. On returning from each recursive level, the total sum of each block from the level below is added to all elements of the corresponding block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see References ).

Template parameter T is the datatype; isBackward specifies backward or forward scan; isExclusive specifies exclusive or inclusive scan, and op specifies the binary associative operator to be used.

Parameters

[out]	d_out	The output array for the scan results
[in]	d_in	The input array to be scanned
[out]	d_blockSums	Array of arrays of per-block sums (one array per recursive level, allocated by allocScanStorage())
[in]	numElements	The number of elements in the array to scan
[in]	numRows	The number of rows in the array to scan
[in]	rowPitches	Array of row pitches (one array per recursive level, allocated by allocScanStorage())
[in]	level	The current recursive level of the scan

void allocScanStorage ( CUDPPScanPlan * plan )

Allocate intermediate arrays used by scan.

Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scan, we need an array in which to store the total sums of all blocks in that level. This function computes the amount of storage needed and allocates it.

Parameters

plan	Pointer to CUDPPScanPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.

void freeScanStorage ( CUDPPScanPlan * plan )

Deallocate intermediate block sums arrays in a CUDPPScanPlan object.

These arrays must have been allocated by allocScanStorage(), which is called by the constructor of cudppScanPlan().

Parameters

plan	Pointer to CUDPPScanPlan object initialized by allocScanStorage().

void cudppScanDispatch	(	void *	d_out,
		const void *	d_in,
		size_t	numElements,
		size_t	numRows,
		const CUDPPScanPlan *	plan
	)

Dispatch function to perform a scan (prefix sum) on an array with the specified configuration.

This is the dispatch routine which calls scanArrayRecursive() with appropriate template parameters and arguments to achieve the scan as specified in plan.

Parameters

[out]	d_out	The output array of scan results
[in]	d_in	The input array
[in]	numElements	The number of elements to scan
[in]	numRows	The number of rows to scan in parallel
[in]	plan	Pointer to CUDPPScanPlan object containing scan options and intermediate storage

void runStringSort	(	unsigned int *	pkeys,
		unsigned int *	pvals,
		unsigned int *	stringVals,
		size_t	numElements,
		size_t	stringArrayLength,
		unsigned char	termC,
		const CUDPPStringSortPlan *	plan
	)

Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings.

Parameters

[in,out]	pkeys	Keys (first four characters of string) to be sorted.
[in,out]	pvals	Addresses of string locations for tie-breaks
[out]	stringVals	global string value array (four characters stuffed into a uint)
[in]	numElements	Number of elements in the sort.
[in]	stringArrayLength	The size of our string array in uints (4 chars per uint)
[in]	plan	Configuration information for mergesort.
[in]	termC	Termination character for our strings

void allocStringSortStorage ( CUDPPStringSortPlan * plan )

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters

[in] plan Pointer to CUDPPStringSortPlan object

void freeStringSortStorage ( CUDPPStringSortPlan * plan )

Deallocates intermediate memory from allocStringSortStorage.

Parameters

[in] plan Pointer to CUDPStringSortPlan object

void cudppStringSortDispatch	(	unsigned int *	keys,
		unsigned int *	values,
		unsigned int *	stringVals,
		size_t	numElements,
		size_t	stringArrayLength,
		unsigned char	termC,
		const CUDPPStringSortPlan *	plan
	)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls stringSort...() with appropriate template parameters and arguments as specified by the plan.

Parameters

[in,out]	keys	Keys (first four chars of string) to be sorted.
[in,out]	values	Address of string values in array of null terminated strings
[in]	stringVals	Global string array
[in]	numElements	Number of elements in the sort.
[in]	stringArrayLength	The size of our string array in uints (4 chars per uint)
[in]	termC	Termination character for our strings
[in]	plan	Configuration information for mergeSort.

template<typename T >

void crpcr	(	T *	d_a,
		T *	d_b,
		T *	d_c,
		T *	d_d,
		T *	d_x,
		unsigned int	systemSizeOriginal,
		unsigned int	numSystems
	)

Hybrid CR-PCR solver (CRPCR)

This is a wrapper function for the GPU CR-PCR kernel.

Parameters

[out]	d_x	Solution vector
[in]	d_a	Lower diagonal
[in]	d_b	Main diagonal
[in]	d_c	Upper diagonal
[in]	d_d	Right hand side
[in]	systemSizeOriginal	The size of the linear system
[in]	numSystems	The number of systems to be solved

CUDPPResult cudppTridiagonalDispatch	(	void *	d_a,
		void *	d_b,
		void *	d_c,
		void *	d_d,
		void *	d_x,
		int	systemSize,
		int	numSystems,
		const CUDPPTridiagonalPlan *	plan
	)

Dispatches the tridiagonal function based on the plan.

This is the dispatch call for the tridiagonal solver in either float or double datatype.

Parameters

[out]	d_x	Solution vector
[in]	d_a	Lower diagonal
[in]	d_b	Main diagonal
[in]	d_c	Upper diagonal
[in]	d_d	Right hand side
[in]	systemSize	The size of the linear system
[in]	numSystems	The number of systems to be solved
[in]	plan	pointer to CUDPPTridiagonalPlan

Returns: CUDPPResult indicating success or error condition

Modules

Classes

Compact Functions

Compress Functions

ListRank Functions

MergeSort Functions

MultiSplit Functions

RadixSort Functions

Suffix Array Functions

Scan Functions

StringSort Functions

Tridiagonal functions

Detailed Description

Function Documentation