CUDPP  2.2
CUDA Data-Parallel Primitives Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Groups Pages
Modules | Classes
CUDPP Application-Level API

Modules

 Hash Table Data Structures and Constants
 

Classes

class  CudaHT::CuckooHashing::HashTable
 Basic hash table that stores one value for each key. More...
 

Compact Functions

void calculateCompactLaunchParams (const unsigned int numElements, unsigned int &numThreads, unsigned int &numBlocks, unsigned int &numEltsPerBlock)
 Calculate launch parameters for compactArray(). More...
 
template<class T >
void compactArray (T *d_out, size_t *d_numValidElements, const T *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan)
 Compact the non-zero elements of an array. More...
 
void allocCompactStorage (CUDPPCompactPlan *plan)
 Allocate intermediate arrays used by cudppCompact(). More...
 
void freeCompactStorage (CUDPPCompactPlan *plan)
 Deallocate intermediate storage used by cudppCompact(). More...
 
void cudppCompactDispatch (void *d_out, size_t *d_numValidElements, const void *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan)
 Dispatch compactArray for the specified datatype. More...
 

Compress Functions

void huffmanEncoding (unsigned int *d_hist, unsigned int *d_encodeOffset, unsigned int *d_compressedSize, unsigned int *d_compressed, size_t numElements, const CUDPPCompressPlan *plan)
 Perform Huffman encoding. More...
 
template<class T >
void moveToFrontTransform (unsigned char *d_mtfIn, unsigned char *d_mtfOut, size_t numElements, const T *plan)
 Perform the Move-to-Front Transform (MTF) More...
 
template<class T >
void burrowsWheelerTransform (unsigned char *d_uncompressed, int *d_bwtIndex, unsigned char *d_bwtOut, size_t numElements, const T *plan)
 Perform the Burrows-Wheeler Transform (BWT) More...
 
void burrowsWheelerTransformWrapper (unsigned char *d_in, int *d_bwtIndex, size_t numElements, const CUDPPCompressPlan *plan)
 Wrapper for calling the Burrows-Wheeler Transform (BWT). More...
 
void burrowsWheelerTransformWrapper (unsigned char *d_in, int *d_bwtIndex, unsigned char *d_bwtOut, size_t numElements, const CUDPPBwtPlan *plan)
 Wrapper for calling the Burrows-Wheeler Transform (BWT). More...
 
void moveToFrontTransformWrapper (size_t numElements, const CUDPPCompressPlan *plan)
 Wrapper for calling the Move-to-Front (MTF) transform. More...
 
void moveToFrontTransformWrapper (unsigned char *d_in, unsigned char *d_mtfOut, size_t numElements, const CUDPPMtfPlan *plan)
 Wrapper for calling the Move-to-Front (MTF) transform. More...
 
void allocBwtStorage (CUDPPBwtPlan *plan)
 Allocate intermediate arrays used by BWT. More...
 
void allocMtfStorage (CUDPPMtfPlan *plan)
 Allocate intermediate arrays used by MTF. More...
 
void allocCompressStorage (CUDPPCompressPlan *plan)
 Allocate intermediate arrays used by compression. More...
 
void freeCompressStorage (CUDPPCompressPlan *plan)
 Deallocate intermediate block arrays in a CUDPPCompressPlan object. More...
 
void freeBwtStorage (CUDPPBwtPlan *plan)
 Deallocate intermediate block arrays in a CUDPPBwtPlan object. More...
 
void freeMtfStorage (CUDPPMtfPlan *plan)
 Deallocate intermediate block arrays in a CUDPPMtfPlan object. More...
 
void cudppCompressDispatch (void *d_uncompressed, void *d_bwtIndex, void *d_histSize, void *d_hist, void *d_encodeOffset, void *d_compressedSize, void *d_compressed, size_t numElements, const CUDPPCompressPlan *plan)
 Dispatch function to perform parallel compression on an array with the specified configuration. More...
 
void cudppBwtDispatch (void *d_in, void *d_out, void *d_index, size_t numElements, const CUDPPBwtPlan *plan)
 Dispatch function to perform the Burrows-Wheeler transform. More...
 
void cudppMtfDispatch (void *d_in, void *d_out, size_t numElements, const CUDPPMtfPlan *plan)
 Dispatch function to perform the Move-to-Front transform. More...
 

ListRank Functions

template<typename T >
void listRank (T *d_ranked_values, T *d_unranked_values, int *d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan *plan)
 Launch list ranking. More...
 
void allocListRankStorage (CUDPPListRankPlan *plan)
 Allocate intermediate arrays used by ListRank. More...
 
void freeListRankStorage (CUDPPListRankPlan *plan)
 Deallocate intermediate block arrays in a CUDPPListRankPlan object. More...
 
CUDPPResult cudppListRankDispatch (void *d_ranked_values, void *d_unranked_values, void *d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan *plan)
 Dispatch function to perform parallel list ranking on a linked-list with the specified configuration. More...
 

MergeSort Functions

template<typename T >
void runMergeSort (T *pkeys, unsigned int *pvals, size_t numElements, const CUDPPMergeSortPlan *plan)
 Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge. More...
 
void allocMergeSortStorage (CUDPPMergeSortPlan *plan)
 From the programmer-specified sort configuration, creates internal memory for performing the sort. More...
 
void freeMergeSortStorage (CUDPPMergeSortPlan *plan)
 Deallocates intermediate memory from allocRadixSortStorage. More...
 
void cudppMergeSortDispatch (void *keys, void *values, size_t numElements, const CUDPPMergeSortPlan *plan)
 Dispatch function to perform a sort on an array with a specified configuration. More...
 
#define BLOCKSORT_SIZE   1024
 
#define DEPTH   8
 

RadixSort Functions

void allocRadixSortStorage (CUDPPRadixSortPlan *plan)
 From the programmer-specified sort configuration, creates internal memory for performing the sort. More...
 
void freeRadixSortStorage (CUDPPRadixSortPlan *plan)
 Deallocates intermediate memory from allocRadixSortStorage. More...
 
template<typename T >
void runSort (T *pkeys, unsigned int *pvals, size_t numElements, const CUDPPRadixSortPlan *plan)
 
void cudppRadixSortDispatch (void *keys, void *values, size_t numElements, const CUDPPRadixSortPlan *plan)
 Dispatch function to perform a sort on an array with a specified configuration. More...
 

Suffix Array Functions

void KeyValueSort (unsigned int num_elements, unsigned int *d_keys, unsigned int *d_values)
 Radix Sort kernel from NVlab cub library. More...
 
void ComputeSA (unsigned int *d_str, unsigned int *d_keys_sa, size_t str_length, mgpu::CudaContext &context, CUDPPSaPlan *plan, unsigned int offset, unsigned int stage)
 Perform Suffix Array (SA) using skew algorithm. More...
 
void allocSaStorage (CUDPPSaPlan *plan)
 Allocate intermediate arrays used by suffix array. More...
 
void freeSaStorage (CUDPPSaPlan *plan)
 Deallocate intermediate block arrays in a CUDPPSaPlan object. More...
 
void cudppSuffixArrayDispatch (void *d_str, unsigned int *d_keys_sa, size_t d_str_length, CUDPPSaPlan *plan)
 Dispatch function to perform parallel suffix array on a string with the specified configuration. More...
 

Scan Functions

template<class T , bool isBackward, bool isExclusive, class Op >
void scanArrayRecursive (T *d_out, const T *d_in, T **d_blockSums, size_t numElements, size_t numRows, const size_t *rowPitches, int level)
 Perform recursive scan on arbitrary size arrays. More...
 
void allocScanStorage (CUDPPScanPlan *plan)
 Allocate intermediate arrays used by scan. More...
 
void freeScanStorage (CUDPPScanPlan *plan)
 Deallocate intermediate block sums arrays in a CUDPPScanPlan object. More...
 
template<typename T , bool isBackward, bool isExclusive>
void cudppScanDispatchOperator (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)
 
template<bool isBackward, bool isExclusive>
void cudppScanDispatchType (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)
 
void cudppScanDispatch (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan)
 Dispatch function to perform a scan (prefix sum) on an array with the specified configuration. More...
 

StringSort Functions

void dotAdd (unsigned int *d_address, unsigned int *numSpaces, unsigned int *packedAddress, size_t numElements, size_t stringArrayLength)
 
void calculateAlignedOffsets (unsigned int *d_address, unsigned int *numSpaces, unsigned char *d_stringVals, unsigned char termC, size_t numElements, size_t stringArrayLength)
 
void packStrings (unsigned int *packedStrings, unsigned char *d_stringVals, unsigned int *d_keys, unsigned int *packedAddress, unsigned int *address, size_t numElements, size_t stringArrayLength, unsigned char termC)
 
void unpackStrings (unsigned int *packedAddress, unsigned int *packedAddressRef, unsigned int *address, unsigned int *addressRef, size_t numElements)
 
void runStringSort (unsigned int *pkeys, unsigned int *pvals, unsigned int *stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan *plan)
 Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings. More...
 
void allocStringSortStorage (CUDPPStringSortPlan *plan)
 From the programmer-specified sort configuration, creates internal memory for performing the sort. More...
 
void freeStringSortStorage (CUDPPStringSortPlan *plan)
 Deallocates intermediate memory from allocStringSortStorage. More...
 
void cudppStringSortDispatch (unsigned int *keys, unsigned int *values, unsigned int *stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan *plan)
 Dispatch function to perform a sort on an array with a specified configuration. More...
 
#define BLOCKSORT_SIZE   1024
 
#define DEPTH   8
 

Tridiagonal functions

template<typename T >
unsigned int crpcrSharedSize (unsigned int systemSizeOriginal)
 
template<typename T >
void crpcr (T *d_a, T *d_b, T *d_c, T *d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int numSystems)
 Hybrid CR-PCR solver (CRPCR) More...
 
CUDPPResult cudppTridiagonalDispatch (void *d_a, void *d_b, void *d_c, void *d_d, void *d_x, int systemSize, int numSystems, const CUDPPTridiagonalPlan *plan)
 Dispatches the tridiagonal function based on the plan. More...
 

Detailed Description

The CUDPP Application-Level API contains functions that run on the host CPU and invoke GPU routines in the CUDPP Kernel-Level API. Application-Level API functions are used by CUDPP Public Interface functions to implement CUDPP's core functionality.

Function Documentation

void calculateCompactLaunchParams ( const unsigned int  numElements,
unsigned int &  numThreads,
unsigned int &  numBlocks,
unsigned int &  numEltsPerBlock 
)

Calculate launch parameters for compactArray().

Calculates the block size and number of blocks from the total number of elements and the maximum threads per block. Called by compactArray().

The calculation is pretty straightforward - the number of blocks is calculated by dividing the number of input elements by the product of the number of threads in each CTA and the number of elements each thread will process. numThreads and numEltsPerBlock are also simple to calculate. Please note that in cases where numElements is not an exact multiple of SCAN_ELTS_PER_THREAD * CTA_SIZE we would have threads which do nothing or have a thread which will process less than SCAN_ELTS_PER_THREAD elements.

Parameters
[in]numElementsNumber of elements to sort
[out]numThreadsNumber of threads in each block
[out]numBlocksNumber of blocks
[out]numEltsPerBlockNumber of elements processed per block
template<class T >
void compactArray ( T *  d_out,
size_t *  d_numValidElements,
const T *  d_in,
const unsigned int *  d_isValid,
size_t  numElements,
const CUDPPCompactPlan plan 
)

Compact the non-zero elements of an array.

Given an input array d_in, compactArray() outputs a compacted version which does not have null (zero) elements. Also ouputs the number of non-zero elements in the compacted array. Called by cudppCompactDispatch().

The algorithm is straightforward, involving two steps (most of the complexity is hidden in scan, invoked with cudppScanDispatch() ).

  1. scanArray() performs a prefix sum on d_isValid to compute output indices.
  2. compactData() takes d_in and an intermediate array of output indices as input and writes the values with valid flags in d_isValid into d_out using the output indices.
Parameters
[out]d_outArray of compacted non-null elements
[out]d_numValidElementsPointer to unsigned int to store number of non-null elements
[in]d_inInput array
[out]d_isValidArray of flags, 1 for each non-null element, 0 for each null element. Same length as d_in
[in]numElementsNumber of elements in input array
[in]planPointer to the plan object used for this compact
void allocCompactStorage ( CUDPPCompactPlan plan)

Allocate intermediate arrays used by cudppCompact().

In addition to the internal CUDPPScanPlan contained in CUDPPCompactPlan, CUDPPCompact also needs a temporary device array of output indices, which is allocated by this function.

Parameters
planPointer to CUDPPCompactPlan object within which intermediate storage is allocated.
void freeCompactStorage ( CUDPPCompactPlan plan)

Deallocate intermediate storage used by cudppCompact().

Deallocates the output indices array allocated by allocCompactStorage().

Parameters
planPointer to CUDPPCompactPlan object initialized by allocCompactStorage().
void cudppCompactDispatch ( void *  d_out,
size_t *  d_numValidElements,
const void *  d_in,
const unsigned int *  d_isValid,
size_t  numElements,
const CUDPPCompactPlan plan 
)

Dispatch compactArray for the specified datatype.

A thin wrapper on top of compactArray which calls compactArray() for the data type specified in config. This is the app-level interface to compact used by cudppCompact().

Parameters
[out]d_outCompacted array of non-zero elements
[out]d_numValidElementsPointer to an unsigned int to store the number of non-zero elements
[in]d_inInput array
[in]d_isValidArray of boolean valid flags with same length as d_in
[in]numElementsNumber of elements to compact
[in]planPointer to plan object for this compact
void huffmanEncoding ( unsigned int *  d_hist,
unsigned int *  d_encodeOffset,
unsigned int *  d_compressedSize,
unsigned int *  d_compressed,
size_t  numElements,
const CUDPPCompressPlan *  plan 
)

Perform Huffman encoding.

Performs Huffman encoding on the input data stream. The input data stream is the output data stream from the previous stage (MTF) in our compress stream.

The input is given by the output of the Move-to-Front transform (MTF). There are a few things that need to be store along with the compressed data. We also store the word offset of the compressed data stream because our data is compressed into indepedent blocks (word granularity) so that they can be encoded and decoded in parallel. The number of independent blocks is HUFF_THREADS_PER_BLOCK*HUFF_WORK_PER_THREAD.

Parameters
[out]d_histHistogram array of the input data stream used for decoding.
[out]d_encodeOffsetAn array of the word offsets of the independent compressed data blocks.
[out]d_compressedSizePointer to the total size in words of all compressed data blocks combined.
[out]d_compressedA pointer to the compressed data blocks.
[in]numElementsTotal number of input elements to compress.
[in]planPointer to the plan object used for this compress.
template<class T >
void moveToFrontTransform ( unsigned char *  d_mtfIn,
unsigned char *  d_mtfOut,
size_t  numElements,
const T *  plan 
)

Perform the Move-to-Front Transform (MTF)

Performs a Move-to-Front (MTF) transform on the input data stream. The MTF transform is the second stage in our compress pipeline. The MTF manipulates the input data stream to improve the performance of entropy encoding.

Parameters
[in]d_mtfInAn array of the input data stream to perform the MTF transform on.
[out]d_mtfOutAn array to store the output of the MTF transform.
[in]numElementsTotal number of input elements of the MTF transform.
[in]planPointer to the plan object used for this MTF transform.
template<class T >
void burrowsWheelerTransform ( unsigned char *  d_uncompressed,
int *  d_bwtIndex,
unsigned char *  d_bwtOut,
size_t  numElements,
const T *  plan 
)

Perform the Burrows-Wheeler Transform (BWT)

Performs the Burrows-Wheeler Transform (BWT) on a given character string. The BWT is an algorithm which is commonly used in compression applications, mainly bzip2. The BWT orders the characters in such a way that the output tends to have many long runs of repeated characters. This bodes well for later stages in compression pipelines which perform better with repeated characters.

Parameters
[in]d_uncompressedA char array of the input data stream to perform the BWT on.
[out]d_bwtIndexThe index at which the original string in the BWT sorts to.
[out]d_bwtOutAn array to store the output of the BWT.
[in]numElementsTotal number of input elements of the BWT.
[in]planPointer to the plan object used for this BWT.
void burrowsWheelerTransformWrapper ( unsigned char *  d_in,
int *  d_bwtIndex,
size_t  numElements,
const CUDPPCompressPlan *  plan 
)

Wrapper for calling the Burrows-Wheeler Transform (BWT).

This is a wrapper function for calling the BWT. This wrapper is used internally via the compress application to call burrowsWheelerTransform().

Parameters
[in]d_inA char array of the input data stream to perform the BWT on.
[out]d_bwtIndexThe index at which the original string in the BWT sorts to.
[in]numElementsTotal number of input elements to the compress stream.
[in]planPointer to the plan object used for this compress.
void burrowsWheelerTransformWrapper ( unsigned char *  d_in,
int *  d_bwtIndex,
unsigned char *  d_bwtOut,
size_t  numElements,
const CUDPPBwtPlan plan 
)

Wrapper for calling the Burrows-Wheeler Transform (BWT).

This is a wrapper function for calling the BWT. This wrapper is used internally via the BWT primitive to call burrowsWheelerTransform().

Parameters
[in]d_inA char array of the input data stream to perform the BWT on.
[out]d_bwtIndexThe index at which the original string in the BWT sorts to.
[out]d_bwtOutAn array to store the output of the BWT.
[in]numElementsTotal number of input elements to the BWT.
[in]planPointer to the plan object used for this BWT.
void moveToFrontTransformWrapper ( size_t  numElements,
const CUDPPCompressPlan *  plan 
)

Wrapper for calling the Move-to-Front (MTF) transform.

This is a wrapper function for calling the MTF. This wrapper is used internally via the compress application to call moveToFrontTransform().

Parameters
[in]numElementsTotal number of input elements to the MTF transform.
[in]planPointer to the plan object used for this compress.
void moveToFrontTransformWrapper ( unsigned char *  d_in,
unsigned char *  d_mtfOut,
size_t  numElements,
const CUDPPMtfPlan plan 
)

Wrapper for calling the Move-to-Front (MTF) transform.

This is a wrapper function for calling the MTF. This wrapper is used internally via the MTF primitive to call moveToFrontTransform().

Parameters
[in]d_inAn input char array to perform the MTF on.
[in]d_mtfOutAn output char array to store the MTF transformed stream.
[in]numElementsTotal number of input elements to the MTF transform.
[in]planPointer to the plan object used for this MTF.
void allocBwtStorage ( CUDPPBwtPlan plan)

Allocate intermediate arrays used by BWT.

Parameters
[in,out]planPointer to CUDPPBwtPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void allocMtfStorage ( CUDPPMtfPlan plan)

Allocate intermediate arrays used by MTF.

Parameters
[in,out]planPointer to CUDPPMtfPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void allocCompressStorage ( CUDPPCompressPlan *  plan)

Allocate intermediate arrays used by compression.

Parameters
[in,out]planPointer to CUDPPCompressPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void freeCompressStorage ( CUDPPCompressPlan *  plan)

Deallocate intermediate block arrays in a CUDPPCompressPlan object.

Parameters
[in,out]planPointer to CUDPPCompressPlan object initialized by allocCompressStorage().
void freeBwtStorage ( CUDPPBwtPlan plan)

Deallocate intermediate block arrays in a CUDPPBwtPlan object.

Parameters
[in,out]planPointer to CUDPPBwtPlan object initialized by allocBwtStorage().
void freeMtfStorage ( CUDPPMtfPlan plan)

Deallocate intermediate block arrays in a CUDPPMtfPlan object.

Parameters
[in,out]planPointer to CUDPPMtfPlan object initialized by allocMtfStorage().
void cudppCompressDispatch ( void *  d_uncompressed,
void *  d_bwtIndex,
void *  d_histSize,
void *  d_hist,
void *  d_encodeOffset,
void *  d_compressedSize,
void *  d_compressed,
size_t  numElements,
const CUDPPCompressPlan *  plan 
)

Dispatch function to perform parallel compression on an array with the specified configuration.

Parameters
[in]d_uncompressedUncompressed data
[out]d_bwtIndexBWT Index
[out]d_histSizeHistogram size
[out]d_histHistogram
[out]d_encodeOffsetEncoded offset table
[out]d_compressedSizeSize of compressed data
[out]d_compressedCompressed data
[in]numElementsNumber of elements to compress
[in]planPointer to CUDPPCompressPlan object containing compress options and intermediate storage
void cudppBwtDispatch ( void *  d_in,
void *  d_out,
void *  d_index,
size_t  numElements,
const CUDPPBwtPlan plan 
)

Dispatch function to perform the Burrows-Wheeler transform.

Parameters
[in]d_inInput data
[out]d_outTransformed data
[out]d_indexBWT Index
[in]numElementsNumber of elements to compress
[in]planPointer to CUDPPBwtPlan object containing compress options and intermediate storage
void cudppMtfDispatch ( void *  d_in,
void *  d_out,
size_t  numElements,
const CUDPPMtfPlan plan 
)

Dispatch function to perform the Move-to-Front transform.

Parameters
[in]d_inInput data
[out]d_outTransformed data
[in]numElementsNumber of elements to compress
[in]planPointer to CUDPPMtfPlan object containing compress options and intermediate storage
template<typename T >
void listRank ( T *  d_ranked_values,
T *  d_unranked_values,
int *  d_next_indices,
size_t  head,
size_t  numElements,
const CUDPPListRankPlan plan 
)

Launch list ranking.

Given two inputs arrays, d_unranked_values and d_next_indices, listRank() outputs a ranked version of the unranked values by traversing the next indices. The head index is head. Called by cudppListRankDispatch().

Parameters
[out]d_ranked_valuesRanked values array
[in]d_unranked_valuesUnranked values array
[in]d_next_indicesNext indices array
[in]headHead pointer index
[in]numElementsNumber of nodes values to rank
[in]planPointer to CUDPPListRankPlan object containing list ranking options and intermediate storage
void allocListRankStorage ( CUDPPListRankPlan plan)

Allocate intermediate arrays used by ListRank.

CUDPPListRank needs temporary device arrays to store next indices so that it does not overwrite the original array.

Parameters
[in,out]planPointer to CUDPPListRankPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void freeListRankStorage ( CUDPPListRankPlan plan)

Deallocate intermediate block arrays in a CUDPPListRankPlan object.

Deallocates the output indices array allocated by allocListRankStorage().

Parameters
[in,out]planPointer to CUDPPListRankPlan object initialized by allocListRankStorage().
CUDPPResult cudppListRankDispatch ( void *  d_ranked_values,
void *  d_unranked_values,
void *  d_next_indices,
size_t  head,
size_t  numElements,
const CUDPPListRankPlan plan 
)

Dispatch function to perform parallel list ranking on a linked-list with the specified configuration.

A wrapper on top of listRank which calls listRank() for the data type specified in config. This is the app-level interface to list ranking used by cudppListRank().

Parameters
[out]d_ranked_valuesRanked values array
[in]d_unranked_valuesUnranked values array
[in]d_next_indicesNext indices array
[in]headHead pointer index
[in]numElementsNumber of nodes values to rank
[in]planPointer to CUDPPListRankPlan object containing list ranking options and intermediate storage
Returns
CUDPPResult indicating success or error condition
template<typename T >
void runMergeSort ( T *  pkeys,
unsigned int *  pvals,
size_t  numElements,
const CUDPPMergeSortPlan plan 
)

Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge.

Parameters
[in,out]pkeysKeys to be sorted.
[in,out]pvalsAssociated values to be sorted
[in]numElementsNumber of elements in the sort.
[in]planConfiguration information for mergesort.
void allocMergeSortStorage ( CUDPPMergeSortPlan plan)

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters
[in]planPointer to CUDPPMergeSortPlan object
void freeMergeSortStorage ( CUDPPMergeSortPlan plan)

Deallocates intermediate memory from allocRadixSortStorage.

Parameters
[in]planPointer to CUDPPMergeSortPlan object
void cudppMergeSortDispatch ( void *  keys,
void *  values,
size_t  numElements,
const CUDPPMergeSortPlan plan 
)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls mergeSort...() with appropriate template parameters and arguments as specified by the plan. Currently only sorts keys of type int, unsigned int, and float.

Parameters
[in,out]keysKeys to be sorted.
[in,out]valuesAssociated values to be sorted (through keys).
[in]numElementsNumber of elements in the sort.
[in]planConfiguration information for mergeSort.
void allocRadixSortStorage ( CUDPPRadixSortPlan plan)

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters
[in]planPointer to CUDPPRadixSortPlan object
void freeRadixSortStorage ( CUDPPRadixSortPlan plan)

Deallocates intermediate memory from allocRadixSortStorage.

Parameters
[in]planPointer to CUDPPRadixSortPlan object
void cudppRadixSortDispatch ( void *  keys,
void *  values,
size_t  numElements,
const CUDPPRadixSortPlan plan 
)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls radixSort...() with appropriate template parameters and arguments as specified by the plan.

Parameters
[in,out]keysKeys to be sorted.
[in,out]valuesAssociated values to be sorted (through keys).
[in]numElementsNumber of elements in the sort.
[in]planConfiguration information for RadixSort.
void KeyValueSort ( unsigned int  num_elements,
unsigned int *  d_keys,
unsigned int *  d_values 
)

Radix Sort kernel from NVlab cub library.

Parameters
[in]num_elementsNumber of elements to sort.
[in]d_keysKey values of the elements in the array to be sorted.
[in,out]d_valuesPositions of the elements in the array.
void ComputeSA ( unsigned int *  d_str,
unsigned int *  d_keys_sa,
size_t  str_length,
mgpu::CudaContext &  context,
CUDPPSaPlan plan,
unsigned int  offset,
unsigned int  stage 
)

Perform Suffix Array (SA) using skew algorithm.

Performs recursive skew kernel on a given character string. A suffix array is a sorted array of all suffixes of a string. Skew algorithm is a linear-time algorithm based on divde and conquer. The SA of a string can be used as an index to quickly locate every occurrence of a substring pattern within the string. Suffix sorting algorithms can be used to compute the Burrows-Wheeler Transform(BWT). The BWT requires sorting of all cyclic permutations of a string, thus can be computed in linear time by using a suffix array of the string.

Parameters
[in]d_strAn unsigned int array of the input data stream to perform the SA on.
[out]d_keys_saAn array to store the output of the SA.
[in]str_lengthTotal number of input elements.
[in]contextContext format required by mgpu functions.
[in,out]planPointer to the plan object used for this suffix array.
[in,out]offsetOffset to move head pointer to find the memory for each iteration.
[in,out]stageStage for each iteration.
void allocSaStorage ( CUDPPSaPlan plan)

Allocate intermediate arrays used by suffix array.

Parameters
[in,out]planPointer to CUDPPSaPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void freeSaStorage ( CUDPPSaPlan plan)

Deallocate intermediate block arrays in a CUDPPSaPlan object.

Parameters
[in,out]planPointer to CUDPPSaPlan object initialized by allocSaStorage().
void cudppSuffixArrayDispatch ( void *  d_str,
unsigned int *  d_keys_sa,
size_t  d_str_length,
CUDPPSaPlan plan 
)

Dispatch function to perform parallel suffix array on a string with the specified configuration.

Parameters
[in]d_strinput string with three $
[out]d_keys_salexicographically sorted suffix position array
[in]d_str_lengthNumber of elements in the string including $
[in]planPointer to CUDPPSaPlan object containing suffix_array options and intermediate storage
template<class T , bool isBackward, bool isExclusive, class Op >
void scanArrayRecursive ( T *  d_out,
const T *  d_in,
T **  d_blockSums,
size_t  numElements,
size_t  numRows,
const size_t *  rowPitches,
int  level 
)

Perform recursive scan on arbitrary size arrays.

This is the CPU-side workhorse function of the scan engine. This function invokes the CUDA kernels which perform the scan on individual blocks.

Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level has more than one block, it calls itself recursively. On returning from each recursive level, the total sum of each block from the level below is added to all elements of the corresponding block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see References ).

Template parameter T is the datatype; isBackward specifies backward or forward scan; isExclusive specifies exclusive or inclusive scan, and op specifies the binary associative operator to be used.

Parameters
[out]d_outThe output array for the scan results
[in]d_inThe input array to be scanned
[out]d_blockSumsArray of arrays of per-block sums (one array per recursive level, allocated by allocScanStorage())
[in]numElementsThe number of elements in the array to scan
[in]numRowsThe number of rows in the array to scan
[in]rowPitchesArray of row pitches (one array per recursive level, allocated by allocScanStorage())
[in]levelThe current recursive level of the scan
void allocScanStorage ( CUDPPScanPlan plan)

Allocate intermediate arrays used by scan.

Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scan, we need an array in which to store the total sums of all blocks in that level. This function computes the amount of storage needed and allocates it.

Parameters
planPointer to CUDPPScanPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated.
void freeScanStorage ( CUDPPScanPlan plan)

Deallocate intermediate block sums arrays in a CUDPPScanPlan object.

These arrays must have been allocated by allocScanStorage(), which is called by the constructor of cudppScanPlan().

Parameters
planPointer to CUDPPScanPlan object initialized by allocScanStorage().
void cudppScanDispatch ( void *  d_out,
const void *  d_in,
size_t  numElements,
size_t  numRows,
const CUDPPScanPlan plan 
)

Dispatch function to perform a scan (prefix sum) on an array with the specified configuration.

This is the dispatch routine which calls scanArrayRecursive() with appropriate template parameters and arguments to achieve the scan as specified in plan.

Parameters
[out]d_outThe output array of scan results
[in]d_inThe input array
[in]numElementsThe number of elements to scan
[in]numRowsThe number of rows to scan in parallel
[in]planPointer to CUDPPScanPlan object containing scan options and intermediate storage
void runStringSort ( unsigned int *  pkeys,
unsigned int *  pvals,
unsigned int *  stringVals,
size_t  numElements,
size_t  stringArrayLength,
unsigned char  termC,
const CUDPPStringSortPlan plan 
)

Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings.

Parameters
[in,out]pkeysKeys (first four characters of string) to be sorted.
[in,out]pvalsAddresses of string locations for tie-breaks
[out]stringValsglobal string value array (four characters stuffed into a uint)
[in]numElementsNumber of elements in the sort.
[in]stringArrayLengthThe size of our string array in uints (4 chars per uint)
[in]planConfiguration information for mergesort.
[in]termCTermination character for our strings
void allocStringSortStorage ( CUDPPStringSortPlan plan)

From the programmer-specified sort configuration, creates internal memory for performing the sort.

Parameters
[in]planPointer to CUDPPStringSortPlan object
void freeStringSortStorage ( CUDPPStringSortPlan plan)

Deallocates intermediate memory from allocStringSortStorage.

Parameters
[in]planPointer to CUDPStringSortPlan object
void cudppStringSortDispatch ( unsigned int *  keys,
unsigned int *  values,
unsigned int *  stringVals,
size_t  numElements,
size_t  stringArrayLength,
unsigned char  termC,
const CUDPPStringSortPlan plan 
)

Dispatch function to perform a sort on an array with a specified configuration.

This is the dispatch routine which calls stringSort...() with appropriate template parameters and arguments as specified by the plan.

Parameters
[in,out]keysKeys (first four chars of string) to be sorted.
[in,out]valuesAddress of string values in array of null terminated strings
[in]stringValsGlobal string array
[in]numElementsNumber of elements in the sort.
[in]stringArrayLengthThe size of our string array in uints (4 chars per uint)
[in]termCTermination character for our strings
[in]planConfiguration information for mergeSort.
template<typename T >
void crpcr ( T *  d_a,
T *  d_b,
T *  d_c,
T *  d_d,
T *  d_x,
unsigned int  systemSizeOriginal,
unsigned int  numSystems 
)

Hybrid CR-PCR solver (CRPCR)

This is a wrapper function for the GPU CR-PCR kernel.

Parameters
[out]d_xSolution vector
[in]d_aLower diagonal
[in]d_bMain diagonal
[in]d_cUpper diagonal
[in]d_dRight hand side
[in]systemSizeOriginalThe size of the linear system
[in]numSystemsThe number of systems to be solved
CUDPPResult cudppTridiagonalDispatch ( void *  d_a,
void *  d_b,
void *  d_c,
void *  d_d,
void *  d_x,
int  systemSize,
int  numSystems,
const CUDPPTridiagonalPlan plan 
)

Dispatches the tridiagonal function based on the plan.

This is the dispatch call for the tridiagonal solver in either float or double datatype.

Parameters
[out]d_xSolution vector
[in]d_aLower diagonal
[in]d_bMain diagonal
[in]d_cUpper diagonal
[in]d_dRight hand side
[in]systemSizeThe size of the linear system
[in]numSystemsThe number of systems to be solved
[in]planpointer to CUDPPTridiagonalPlan
Returns
CUDPPResult indicating success or error condition