CUDPP
2.3
CUDA Data-Parallel Primitives Library
|
Modules | |
Hash Table Data Structures and Constants | |
Classes | |
class | CudaHT::CuckooHashing::HashTable |
Basic hash table that stores one value for each key. More... | |
Compact Functions | |
void | calculateCompactLaunchParams (const unsigned int numElements, unsigned int &numThreads, unsigned int &numBlocks, unsigned int &numEltsPerBlock) |
Calculate launch parameters for compactArray(). More... | |
template<class T > | |
void | compactArray (T *d_out, size_t *d_numValidElements, const T *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan) |
Compact the non-zero elements of an array. More... | |
void | allocCompactStorage (CUDPPCompactPlan *plan) |
Allocate intermediate arrays used by cudppCompact(). More... | |
void | freeCompactStorage (CUDPPCompactPlan *plan) |
Deallocate intermediate storage used by cudppCompact(). More... | |
void | cudppCompactDispatch (void *d_out, size_t *d_numValidElements, const void *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan) |
Dispatch compactArray for the specified datatype. More... | |
Compress Functions | |
void | huffmanEncoding (unsigned int *d_hist, unsigned int *d_encodeOffset, unsigned int *d_compressedSize, unsigned int *d_compressed, size_t numElements, const CUDPPCompressPlan *plan) |
Perform Huffman encoding. More... | |
template<class T > | |
void | moveToFrontTransform (unsigned char *d_mtfIn, unsigned char *d_mtfOut, size_t numElements, const T *plan) |
Perform the Move-to-Front Transform (MTF) More... | |
template<class T > | |
void | burrowsWheelerTransform (unsigned char *d_uncompressed, int *d_bwtIndex, unsigned char *d_bwtOut, size_t numElements, const T *plan) |
Perform the Burrows-Wheeler Transform (BWT) More... | |
void | burrowsWheelerTransformWrapper (unsigned char *d_in, int *d_bwtIndex, size_t numElements, const CUDPPCompressPlan *plan) |
Wrapper for calling the Burrows-Wheeler Transform (BWT). More... | |
void | burrowsWheelerTransformWrapper (unsigned char *d_in, int *d_bwtIndex, unsigned char *d_bwtOut, size_t numElements, const CUDPPBwtPlan *plan) |
Wrapper for calling the Burrows-Wheeler Transform (BWT). More... | |
void | moveToFrontTransformWrapper (size_t numElements, const CUDPPCompressPlan *plan) |
Wrapper for calling the Move-to-Front (MTF) transform. More... | |
void | moveToFrontTransformWrapper (unsigned char *d_in, unsigned char *d_mtfOut, size_t numElements, const CUDPPMtfPlan *plan) |
Wrapper for calling the Move-to-Front (MTF) transform. More... | |
void | allocBwtStorage (CUDPPBwtPlan *plan) |
Allocate intermediate arrays used by BWT. More... | |
void | allocMtfStorage (CUDPPMtfPlan *plan) |
Allocate intermediate arrays used by MTF. More... | |
void | allocCompressStorage (CUDPPCompressPlan *plan) |
Allocate intermediate arrays used by compression. More... | |
void | freeCompressStorage (CUDPPCompressPlan *plan) |
Deallocate intermediate block arrays in a CUDPPCompressPlan object. More... | |
void | freeBwtStorage (CUDPPBwtPlan *plan) |
Deallocate intermediate block arrays in a CUDPPBwtPlan object. More... | |
void | freeMtfStorage (CUDPPMtfPlan *plan) |
Deallocate intermediate block arrays in a CUDPPMtfPlan object. More... | |
void | cudppCompressDispatch (void *d_uncompressed, void *d_bwtIndex, void *d_histSize, void *d_hist, void *d_encodeOffset, void *d_compressedSize, void *d_compressed, size_t numElements, const CUDPPCompressPlan *plan) |
Dispatch function to perform parallel compression on an array with the specified configuration. More... | |
void | cudppBwtDispatch (void *d_in, void *d_out, void *d_index, size_t numElements, const CUDPPBwtPlan *plan) |
Dispatch function to perform the Burrows-Wheeler transform. More... | |
void | cudppMtfDispatch (void *d_in, void *d_out, size_t numElements, const CUDPPMtfPlan *plan) |
Dispatch function to perform the Move-to-Front transform. More... | |
ListRank Functions | |
template<typename T > | |
void | listRank (T *d_ranked_values, T *d_unranked_values, int *d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan *plan) |
Launch list ranking. More... | |
void | allocListRankStorage (CUDPPListRankPlan *plan) |
Allocate intermediate arrays used by ListRank. More... | |
void | freeListRankStorage (CUDPPListRankPlan *plan) |
Deallocate intermediate block arrays in a CUDPPListRankPlan object. More... | |
CUDPPResult | cudppListRankDispatch (void *d_ranked_values, void *d_unranked_values, void *d_next_indices, size_t head, size_t numElements, const CUDPPListRankPlan *plan) |
Dispatch function to perform parallel list ranking on a linked-list with the specified configuration. More... | |
MergeSort Functions | |
template<typename T > | |
void | runMergeSort (T *pkeys, unsigned int *pvals, size_t numElements, const CUDPPMergeSortPlan *plan) |
Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge. More... | |
void | allocMergeSortStorage (CUDPPMergeSortPlan *plan) |
From the programmer-specified sort configuration, creates internal memory for performing the sort. More... | |
void | freeMergeSortStorage (CUDPPMergeSortPlan *plan) |
Deallocates intermediate memory from allocRadixSortStorage. More... | |
void | cudppMergeSortDispatch (void *keys, void *values, size_t numElements, const CUDPPMergeSortPlan *plan) |
Dispatch function to perform a sort on an array with a specified configuration. More... | |
#define | BLOCKSORT_SIZE 1024 |
#define | DEPTH 8 |
MultiSplit Functions | |
template<typename bucket_t , typename key_type > | |
void | multisplit_WMS_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context) |
template<typename bucket_t , typename key_type > | |
void | multisplit_WMS_pairs_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context) |
template<typename bucket_t , typename key_type > | |
void | multisplit_WMS_postscan_function (key_type *d_key_in, key_type *d_key_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context) |
template<typename bucket_t , typename key_type , typename value_type > | |
void | multisplit_WMS_pairs_postscan_function (key_type *d_key_in, value_type *d_value_in, key_type *d_key_out, value_type *d_value_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context) |
template<typename bucket_t , typename key_type > | |
void | multisplit_BMS_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context) |
template<typename bucket_t , typename key_type > | |
void | multisplit_BMS_pairs_prescan_function (key_type *d_key_in, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_raw, uint32_t &num_blocks_pre, uint32_t &num_sub_problems, multisplit_context &context) |
template<typename bucket_t , typename key_type > | |
void | multisplit_BMS_postscan_function (key_type *d_key_in, key_type *d_key_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context) |
template<typename bucket_t , typename key_type , typename value_type > | |
void | multisplit_BMS_pairs_postscan_function (key_type *d_key_in, value_type *d_value_in, key_type *d_key_out, value_type *d_value_out, uint32_t num_elements, bucket_t bucket_identifier, uint32_t num_buckets, uint32_t num_blocks_post, multisplit_context &context) |
void | multisplit_allocate_key_only (size_t num_elements, uint32_t num_buckets, multisplit_context &context) |
void | multisplit_allocate_key_value (size_t num_elements, uint32_t num_buckets, multisplit_context &context) |
void | multisplit_release_memory (multisplit_context &context) |
template<typename key_type , typename bucket_t > | |
void | multisplit_key_only (key_type *d_key_in, key_type *d_key_out, size_t num_elements, uint32_t num_buckets, multisplit_context &context, bucket_t bucket_identifier, bool in_place, uint32_t *bucket_offsets=NULL) |
Performs multisplit on keys only. More... | |
template<typename key_type , typename value_type , typename bucket_t > | |
void | multisplit_key_value (key_type *d_key_in, value_type *d_value_in, key_type *d_key_out, value_type *d_value_out, size_t num_elements, uint32_t num_buckets, multisplit_context &context, bucket_t bucket_identifier, bool in_place, uint32_t *bucket_offsets=NULL) |
Performs multisplit on key-value pairs. More... | |
cub::CachingDeviceAllocator | g_allocator (true) |
template<class T > | |
void | reducedBitSortKeysOnly (unsigned int *d_inp, uint numElements, uint numBuckets, T bucketMapper, const CUDPPMultiSplitPlan *plan) |
Performs multisplit on keys only using the reduced-bit sort method. More... | |
template<class T > | |
void | reducedBitSortKeyValue (unsigned int *d_keys, unsigned int *d_values, unsigned int numElements, unsigned int numBuckets, T bucketMapper, const CUDPPMultiSplitPlan *plan) |
Performs multisplit on key-value pairs using a reduced-bit sort. More... | |
void | allocMultiSplitStorage (CUDPPMultiSplitPlan *plan) |
From the programmer-specified multisplit configuration, creates internal memory for performing the multisplit. Different storage amounts are required depending on the number of buckets. More... | |
void | freeMultiSplitStorage (CUDPPMultiSplitPlan *plan) |
Deallocates intermediate memory from allocMultiSplitStorage. More... | |
void | cudppMultiSplitDispatch (unsigned int *d_keys, unsigned int *d_values, size_t numElements, size_t numBuckets, BucketMappingFunc bucketMappingFunc, const CUDPPMultiSplitPlan *plan) |
Dispatch function to perform multisplit on an array of elements into a number of buckets. More... | |
#define | MULTISPLIT_WMS_K_ONE_ROLL 8 |
#define | MULTISPLIT_WMS_K_TWO_ROLL 8 |
#define | MULTISPLIT_WMS_K_THREE_ROLL 4 |
#define | MULTISPLIT_WMS_K_FOUR_ROLL 4 |
#define | MULTISPLIT_WMS_K_FIVE_ROLL 2 |
#define | MULTISPLIT_WMS_KV_ONE_ROLL 4 |
#define | MULTISPLIT_WMS_KV_TWO_ROLL 4 |
#define | MULTISPLIT_WMS_KV_THREE_ROLL 2 |
#define | MULTISPLIT_WMS_KV_FOUR_ROLL 2 |
#define | MULTISPLIT_WMS_KV_FIVE_ROLL 2 |
#define | MULTISPLIT_BMS_K_ONE_ROLL 8 |
#define | MULTISPLIT_BMS_K_TWO_ROLL 8 |
#define | MULTISPLIT_BMS_K_THREE_ROLL 4 |
#define | MULTISPLIT_BMS_K_FOUR_ROLL 4 |
#define | MULTISPLIT_BMS_K_FIVE_ROLL 4 |
#define | MULTISPLIT_BMS_KV_ONE_ROLL 4 |
#define | MULTISPLIT_BMS_KV_TWO_ROLL 4 |
#define | MULTISPLIT_BMS_KV_THREE_ROLL 2 |
#define | MULTISPLIT_BMS_KV_FOUR_ROLL 2 |
#define | MULTISPLIT_BMS_KV_FIVE_ROLL 2 |
#define | MULTISPLIT_SWITCH_STRATEGY_K 8 |
#define | MULTISPLIT_SWITCH_STRATEGY_KV 8 |
#define | MULTISPLIT_NUM_WARPS 8 |
#define | MULTISPLIT_LOG_WARPS 3 |
#define | MULTISPLIT_WARP_WIDTH 32 |
#define | MULTISPLIT_TRHEADS_PER_BLOCK (MULTISPLIT_WARP_WIDTH * MULTISPLIT_NUM_WARPS) |
RadixSort Functions | |
void | allocRadixSortStorage (CUDPPRadixSortPlan *plan) |
From the programmer-specified sort configuration, creates internal memory for performing the sort. More... | |
void | freeRadixSortStorage (CUDPPRadixSortPlan *plan) |
Deallocates intermediate memory from allocRadixSortStorage. More... | |
template<typename T > | |
void | runSort (T *pkeys, unsigned int *pvals, size_t numElements, const CUDPPRadixSortPlan *plan) |
void | cudppRadixSortDispatch (void *keys, void *values, size_t numElements, const CUDPPRadixSortPlan *plan) |
Dispatch function to perform a sort on an array with a specified configuration. More... | |
Suffix Array Functions | |
void | KeyValueSort (unsigned int num_elements, unsigned int *d_keys, unsigned int *d_values) |
Radix Sort kernel from NVlab cub library. More... | |
void | ComputeSA (unsigned int *d_str, unsigned int *d_keys_sa, size_t str_length, mgpu::CudaContext &context, CUDPPSaPlan *plan, unsigned int offset, unsigned int stage) |
Perform Suffix Array (SA) using skew algorithm. More... | |
void | allocSaStorage (CUDPPSaPlan *plan) |
Allocate intermediate arrays used by suffix array. More... | |
void | freeSaStorage (CUDPPSaPlan *plan) |
Deallocate intermediate block arrays in a CUDPPSaPlan object. More... | |
void | cudppSuffixArrayDispatch (void *d_str, unsigned int *d_keys_sa, size_t d_str_length, CUDPPSaPlan *plan) |
Dispatch function to perform parallel suffix array on a string with the specified configuration. More... | |
Scan Functions | |
template<class T , bool isBackward, bool isExclusive, class Op > | |
void | scanArrayRecursive (T *d_out, const T *d_in, T **d_blockSums, size_t numElements, size_t numRows, const size_t *rowPitches, int level) |
Perform recursive scan on arbitrary size arrays. More... | |
void | allocScanStorage (CUDPPScanPlan *plan) |
Allocate intermediate arrays used by scan. More... | |
void | freeScanStorage (CUDPPScanPlan *plan) |
Deallocate intermediate block sums arrays in a CUDPPScanPlan object. More... | |
template<typename T , bool isBackward, bool isExclusive> | |
void | cudppScanDispatchOperator (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
template<bool isBackward, bool isExclusive> | |
void | cudppScanDispatchType (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
void | cudppScanDispatch (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
Dispatch function to perform a scan (prefix sum) on an array with the specified configuration. More... | |
StringSort Functions | |
void | dotAdd (unsigned int *d_address, unsigned int *numSpaces, unsigned int *packedAddress, size_t numElements, size_t stringArrayLength) |
void | calculateAlignedOffsets (unsigned int *d_address, unsigned int *numSpaces, unsigned char *d_stringVals, unsigned char termC, size_t numElements, size_t stringArrayLength) |
void | packStrings (unsigned int *packedStrings, unsigned char *d_stringVals, unsigned int *d_keys, unsigned int *packedAddress, unsigned int *address, size_t numElements, size_t stringArrayLength, unsigned char termC) |
void | unpackStrings (unsigned int *packedAddress, unsigned int *packedAddressRef, unsigned int *address, unsigned int *addressRef, size_t numElements) |
void | runStringSort (unsigned int *pkeys, unsigned int *pvals, unsigned int *stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan *plan) |
Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings. More... | |
void | allocStringSortStorage (CUDPPStringSortPlan *plan) |
From the programmer-specified sort configuration, creates internal memory for performing the sort. More... | |
void | freeStringSortStorage (CUDPPStringSortPlan *plan) |
Deallocates intermediate memory from allocStringSortStorage. More... | |
void | cudppStringSortDispatch (unsigned int *keys, unsigned int *values, unsigned int *stringVals, size_t numElements, size_t stringArrayLength, unsigned char termC, const CUDPPStringSortPlan *plan) |
Dispatch function to perform a sort on an array with a specified configuration. More... | |
#define | BLOCKSORT_SIZE 1024 |
#define | DEPTH 8 |
Tridiagonal functions | |
template<typename T > | |
unsigned int | crpcrSharedSize (unsigned int systemSizeOriginal) |
template<typename T > | |
void | crpcr (T *d_a, T *d_b, T *d_c, T *d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int numSystems) |
Hybrid CR-PCR solver (CRPCR) More... | |
CUDPPResult | cudppTridiagonalDispatch (void *d_a, void *d_b, void *d_c, void *d_d, void *d_x, int systemSize, int numSystems, const CUDPPTridiagonalPlan *plan) |
Dispatches the tridiagonal function based on the plan. More... | |
The CUDPP Application-Level API contains functions that run on the host CPU and invoke GPU routines in the CUDPP Kernel-Level API. Application-Level API functions are used by CUDPP Public Interface functions to implement CUDPP's core functionality.
void calculateCompactLaunchParams | ( | const unsigned int | numElements, |
unsigned int & | numThreads, | ||
unsigned int & | numBlocks, | ||
unsigned int & | numEltsPerBlock | ||
) |
Calculate launch parameters for compactArray().
Calculates the block size and number of blocks from the total number of elements and the maximum threads per block. Called by compactArray().
The calculation is pretty straightforward - the number of blocks is calculated by dividing the number of input elements by the product of the number of threads in each CTA and the number of elements each thread will process. numThreads and numEltsPerBlock are also simple to calculate. Please note that in cases where numElements is not an exact multiple of SCAN_ELTS_PER_THREAD * CTA_SIZE we would have threads which do nothing or have a thread which will process less than SCAN_ELTS_PER_THREAD elements.
[in] | numElements | Number of elements to sort |
[out] | numThreads | Number of threads in each block |
[out] | numBlocks | Number of blocks |
[out] | numEltsPerBlock | Number of elements processed per block |
void compactArray | ( | T * | d_out, |
size_t * | d_numValidElements, | ||
const T * | d_in, | ||
const unsigned int * | d_isValid, | ||
size_t | numElements, | ||
const CUDPPCompactPlan * | plan | ||
) |
Compact the non-zero elements of an array.
Given an input array d_in, compactArray() outputs a compacted version which does not have null (zero) elements. Also ouputs the number of non-zero elements in the compacted array. Called by cudppCompactDispatch().
The algorithm is straightforward, involving two steps (most of the complexity is hidden in scan, invoked with cudppScanDispatch() ).
[out] | d_out | Array of compacted non-null elements |
[out] | d_numValidElements | Pointer to unsigned int to store number of non-null elements |
[in] | d_in | Input array |
[out] | d_isValid | Array of flags, 1 for each non-null element, 0 for each null element. Same length as d_in |
[in] | numElements | Number of elements in input array |
[in] | plan | Pointer to the plan object used for this compact |
void allocCompactStorage | ( | CUDPPCompactPlan * | plan | ) |
Allocate intermediate arrays used by cudppCompact().
In addition to the internal CUDPPScanPlan contained in CUDPPCompactPlan, CUDPPCompact also needs a temporary device array of output indices, which is allocated by this function.
plan | Pointer to CUDPPCompactPlan object within which intermediate storage is allocated. |
void freeCompactStorage | ( | CUDPPCompactPlan * | plan | ) |
Deallocate intermediate storage used by cudppCompact().
Deallocates the output indices array allocated by allocCompactStorage().
plan | Pointer to CUDPPCompactPlan object initialized by allocCompactStorage(). |
void cudppCompactDispatch | ( | void * | d_out, |
size_t * | d_numValidElements, | ||
const void * | d_in, | ||
const unsigned int * | d_isValid, | ||
size_t | numElements, | ||
const CUDPPCompactPlan * | plan | ||
) |
Dispatch compactArray for the specified datatype.
A thin wrapper on top of compactArray which calls compactArray() for the data type specified in config. This is the app-level interface to compact used by cudppCompact().
[out] | d_out | Compacted array of non-zero elements |
[out] | d_numValidElements | Pointer to an unsigned int to store the number of non-zero elements |
[in] | d_in | Input array |
[in] | d_isValid | Array of boolean valid flags with same length as d_in |
[in] | numElements | Number of elements to compact |
[in] | plan | Pointer to plan object for this compact |
void huffmanEncoding | ( | unsigned int * | d_hist, |
unsigned int * | d_encodeOffset, | ||
unsigned int * | d_compressedSize, | ||
unsigned int * | d_compressed, | ||
size_t | numElements, | ||
const CUDPPCompressPlan * | plan | ||
) |
Perform Huffman encoding.
Performs Huffman encoding on the input data stream. The input data stream is the output data stream from the previous stage (MTF) in our compress stream.
The input is given by the output of the Move-to-Front transform (MTF). There are a few things that need to be store along with the compressed data. We also store the word offset of the compressed data stream because our data is compressed into indepedent blocks (word granularity) so that they can be encoded and decoded in parallel. The number of independent blocks is HUFF_THREADS_PER_BLOCK*HUFF_WORK_PER_THREAD.
[out] | d_hist | Histogram array of the input data stream used for decoding. |
[out] | d_encodeOffset | An array of the word offsets of the independent compressed data blocks. |
[out] | d_compressedSize | Pointer to the total size in words of all compressed data blocks combined. |
[out] | d_compressed | A pointer to the compressed data blocks. |
[in] | numElements | Total number of input elements to compress. |
[in] | plan | Pointer to the plan object used for this compress. |
void moveToFrontTransform | ( | unsigned char * | d_mtfIn, |
unsigned char * | d_mtfOut, | ||
size_t | numElements, | ||
const T * | plan | ||
) |
Perform the Move-to-Front Transform (MTF)
Performs a Move-to-Front (MTF) transform on the input data stream. The MTF transform is the second stage in our compress pipeline. The MTF manipulates the input data stream to improve the performance of entropy encoding.
[in] | d_mtfIn | An array of the input data stream to perform the MTF transform on. |
[out] | d_mtfOut | An array to store the output of the MTF transform. |
[in] | numElements | Total number of input elements of the MTF transform. |
[in] | plan | Pointer to the plan object used for this MTF transform. |
void burrowsWheelerTransform | ( | unsigned char * | d_uncompressed, |
int * | d_bwtIndex, | ||
unsigned char * | d_bwtOut, | ||
size_t | numElements, | ||
const T * | plan | ||
) |
Perform the Burrows-Wheeler Transform (BWT)
Performs the Burrows-Wheeler Transform (BWT) on a given character string. The BWT is an algorithm which is commonly used in compression applications, mainly bzip2. The BWT orders the characters in such a way that the output tends to have many long runs of repeated characters. This bodes well for later stages in compression pipelines which perform better with repeated characters.
[in] | d_uncompressed | A char array of the input data stream to perform the BWT on. |
[out] | d_bwtIndex | The index at which the original string in the BWT sorts to. |
[out] | d_bwtOut | An array to store the output of the BWT. |
[in] | numElements | Total number of input elements of the BWT. |
[in] | plan | Pointer to the plan object used for this BWT. |
void burrowsWheelerTransformWrapper | ( | unsigned char * | d_in, |
int * | d_bwtIndex, | ||
size_t | numElements, | ||
const CUDPPCompressPlan * | plan | ||
) |
Wrapper for calling the Burrows-Wheeler Transform (BWT).
This is a wrapper function for calling the BWT. This wrapper is used internally via the compress application to call burrowsWheelerTransform().
[in] | d_in | A char array of the input data stream to perform the BWT on. |
[out] | d_bwtIndex | The index at which the original string in the BWT sorts to. |
[in] | numElements | Total number of input elements to the compress stream. |
[in] | plan | Pointer to the plan object used for this compress. |
void burrowsWheelerTransformWrapper | ( | unsigned char * | d_in, |
int * | d_bwtIndex, | ||
unsigned char * | d_bwtOut, | ||
size_t | numElements, | ||
const CUDPPBwtPlan * | plan | ||
) |
Wrapper for calling the Burrows-Wheeler Transform (BWT).
This is a wrapper function for calling the BWT. This wrapper is used internally via the BWT primitive to call burrowsWheelerTransform().
[in] | d_in | A char array of the input data stream to perform the BWT on. |
[out] | d_bwtIndex | The index at which the original string in the BWT sorts to. |
[out] | d_bwtOut | An array to store the output of the BWT. |
[in] | numElements | Total number of input elements to the BWT. |
[in] | plan | Pointer to the plan object used for this BWT. |
void moveToFrontTransformWrapper | ( | size_t | numElements, |
const CUDPPCompressPlan * | plan | ||
) |
Wrapper for calling the Move-to-Front (MTF) transform.
This is a wrapper function for calling the MTF. This wrapper is used internally via the compress application to call moveToFrontTransform().
[in] | numElements | Total number of input elements to the MTF transform. |
[in] | plan | Pointer to the plan object used for this compress. |
void moveToFrontTransformWrapper | ( | unsigned char * | d_in, |
unsigned char * | d_mtfOut, | ||
size_t | numElements, | ||
const CUDPPMtfPlan * | plan | ||
) |
Wrapper for calling the Move-to-Front (MTF) transform.
This is a wrapper function for calling the MTF. This wrapper is used internally via the MTF primitive to call moveToFrontTransform().
[in] | d_in | An input char array to perform the MTF on. |
[in] | d_mtfOut | An output char array to store the MTF transformed stream. |
[in] | numElements | Total number of input elements to the MTF transform. |
[in] | plan | Pointer to the plan object used for this MTF. |
void allocBwtStorage | ( | CUDPPBwtPlan * | plan | ) |
Allocate intermediate arrays used by BWT.
[in,out] | plan | Pointer to CUDPPBwtPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void allocMtfStorage | ( | CUDPPMtfPlan * | plan | ) |
Allocate intermediate arrays used by MTF.
[in,out] | plan | Pointer to CUDPPMtfPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void allocCompressStorage | ( | CUDPPCompressPlan * | plan | ) |
Allocate intermediate arrays used by compression.
[in,out] | plan | Pointer to CUDPPCompressPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void freeCompressStorage | ( | CUDPPCompressPlan * | plan | ) |
Deallocate intermediate block arrays in a CUDPPCompressPlan object.
[in,out] | plan | Pointer to CUDPPCompressPlan object initialized by allocCompressStorage(). |
void freeBwtStorage | ( | CUDPPBwtPlan * | plan | ) |
Deallocate intermediate block arrays in a CUDPPBwtPlan object.
[in,out] | plan | Pointer to CUDPPBwtPlan object initialized by allocBwtStorage(). |
void freeMtfStorage | ( | CUDPPMtfPlan * | plan | ) |
Deallocate intermediate block arrays in a CUDPPMtfPlan object.
[in,out] | plan | Pointer to CUDPPMtfPlan object initialized by allocMtfStorage(). |
void cudppCompressDispatch | ( | void * | d_uncompressed, |
void * | d_bwtIndex, | ||
void * | d_histSize, | ||
void * | d_hist, | ||
void * | d_encodeOffset, | ||
void * | d_compressedSize, | ||
void * | d_compressed, | ||
size_t | numElements, | ||
const CUDPPCompressPlan * | plan | ||
) |
Dispatch function to perform parallel compression on an array with the specified configuration.
[in] | d_uncompressed | Uncompressed data |
[out] | d_bwtIndex | BWT Index |
[out] | d_histSize | Histogram size |
[out] | d_hist | Histogram |
[out] | d_encodeOffset | Encoded offset table |
[out] | d_compressedSize | Size of compressed data |
[out] | d_compressed | Compressed data |
[in] | numElements | Number of elements to compress |
[in] | plan | Pointer to CUDPPCompressPlan object containing compress options and intermediate storage |
void cudppBwtDispatch | ( | void * | d_in, |
void * | d_out, | ||
void * | d_index, | ||
size_t | numElements, | ||
const CUDPPBwtPlan * | plan | ||
) |
Dispatch function to perform the Burrows-Wheeler transform.
[in] | d_in | Input data |
[out] | d_out | Transformed data |
[out] | d_index | BWT Index |
[in] | numElements | Number of elements to compress |
[in] | plan | Pointer to CUDPPBwtPlan object containing compress options and intermediate storage |
void cudppMtfDispatch | ( | void * | d_in, |
void * | d_out, | ||
size_t | numElements, | ||
const CUDPPMtfPlan * | plan | ||
) |
Dispatch function to perform the Move-to-Front transform.
[in] | d_in | Input data |
[out] | d_out | Transformed data |
[in] | numElements | Number of elements to compress |
[in] | plan | Pointer to CUDPPMtfPlan object containing compress options and intermediate storage |
void listRank | ( | T * | d_ranked_values, |
T * | d_unranked_values, | ||
int * | d_next_indices, | ||
size_t | head, | ||
size_t | numElements, | ||
const CUDPPListRankPlan * | plan | ||
) |
Launch list ranking.
Given two inputs arrays, d_unranked_values and d_next_indices, listRank() outputs a ranked version of the unranked values by traversing the next indices. The head index is head. Called by cudppListRankDispatch().
[out] | d_ranked_values | Ranked values array |
[in] | d_unranked_values | Unranked values array |
[in] | d_next_indices | Next indices array |
[in] | head | Head pointer index |
[in] | numElements | Number of nodes values to rank |
[in] | plan | Pointer to CUDPPListRankPlan object containing list ranking options and intermediate storage |
void allocListRankStorage | ( | CUDPPListRankPlan * | plan | ) |
Allocate intermediate arrays used by ListRank.
CUDPPListRank needs temporary device arrays to store next indices so that it does not overwrite the original array.
[in,out] | plan | Pointer to CUDPPListRankPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void freeListRankStorage | ( | CUDPPListRankPlan * | plan | ) |
Deallocate intermediate block arrays in a CUDPPListRankPlan object.
Deallocates the output indices array allocated by allocListRankStorage().
[in,out] | plan | Pointer to CUDPPListRankPlan object initialized by allocListRankStorage(). |
CUDPPResult cudppListRankDispatch | ( | void * | d_ranked_values, |
void * | d_unranked_values, | ||
void * | d_next_indices, | ||
size_t | head, | ||
size_t | numElements, | ||
const CUDPPListRankPlan * | plan | ||
) |
Dispatch function to perform parallel list ranking on a linked-list with the specified configuration.
A wrapper on top of listRank which calls listRank() for the data type specified in config. This is the app-level interface to list ranking used by cudppListRank().
[out] | d_ranked_values | Ranked values array |
[in] | d_unranked_values | Unranked values array |
[in] | d_next_indices | Next indices array |
[in] | head | Head pointer index |
[in] | numElements | Number of nodes values to rank |
[in] | plan | Pointer to CUDPPListRankPlan object containing list ranking options and intermediate storage |
void runMergeSort | ( | T * | pkeys, |
unsigned int * | pvals, | ||
size_t | numElements, | ||
const CUDPPMergeSortPlan * | plan | ||
) |
Performs merge sort utilizing 3 stages: (1) Blocksort, (2) simple merge and (3) multi merge.
[in,out] | pkeys | Keys to be sorted. |
[in,out] | pvals | Associated values to be sorted |
[in] | numElements | Number of elements in the sort. |
[in] | plan | Configuration information for mergesort. |
void allocMergeSortStorage | ( | CUDPPMergeSortPlan * | plan | ) |
From the programmer-specified sort configuration, creates internal memory for performing the sort.
[in] | plan | Pointer to CUDPPMergeSortPlan object |
void freeMergeSortStorage | ( | CUDPPMergeSortPlan * | plan | ) |
Deallocates intermediate memory from allocRadixSortStorage.
[in] | plan | Pointer to CUDPPMergeSortPlan object |
void cudppMergeSortDispatch | ( | void * | keys, |
void * | values, | ||
size_t | numElements, | ||
const CUDPPMergeSortPlan * | plan | ||
) |
Dispatch function to perform a sort on an array with a specified configuration.
This is the dispatch routine which calls mergeSort...() with appropriate template parameters and arguments as specified by the plan. Currently only sorts keys of type int, unsigned int, and float.
[in,out] | keys | Keys to be sorted. |
[in,out] | values | Associated values to be sorted (through keys). |
[in] | numElements | Number of elements in the sort. |
[in] | plan | Configuration information for mergeSort. |
void multisplit_key_only | ( | key_type * | d_key_in, |
key_type * | d_key_out, | ||
size_t | num_elements, | ||
uint32_t | num_buckets, | ||
multisplit_context & | context, | ||
bucket_t | bucket_identifier, | ||
bool | in_place, | ||
uint32_t * | bucket_offsets = NULL |
||
) |
Performs multisplit on keys only.
This function performs multisplit on a list of keys for a number of buckets less than or equal to 32. If the number of buckets is less than a threshold, a warp-level multisplit is used. If the number of buckets is greater than the threshold, a block-level multisplit is used. This function also supports copying the results of multisplit back into the input array. In addition, the offset indices marking the locations of the buckets in the result array can optionally be saved.
[in,out] | d_key_in | Input keys to be multisplit. |
[out] | d_key_out | Output keys after multisplit. |
[in] | num_elements | Number of elements. |
[in] | num_buckets | Number of buckets. |
[in] | context | Intermediate data storage for multisplit. |
[in] | bucket_identifier | Functor to map an element to a bucket number. |
[in] | in_place | Flag to indicate if results are copied back to the input. |
[out] | bucket_offsets | Optional output list of bucket indices. |
void multisplit_key_value | ( | key_type * | d_key_in, |
value_type * | d_value_in, | ||
key_type * | d_key_out, | ||
value_type * | d_value_out, | ||
size_t | num_elements, | ||
uint32_t | num_buckets, | ||
multisplit_context & | context, | ||
bucket_t | bucket_identifier, | ||
bool | in_place, | ||
uint32_t * | bucket_offsets = NULL |
||
) |
Performs multisplit on key-value pairs.
This function performs multisplit on a list of keys and a list of values for a number of buckets less than or equal to 32. If the number of buckets is less than a threshold, a warp-level multisplit is used. If the number of buckets is greater than the threshold, a block-level multisplit is used. This function also supports copying the results of multisplit back into the key and value input arrays. In addition, the offset indices marking the locations of the buckets in the result arrays can optionally be saved.
[in,out] | d_key_in | Input keys to be multisplit. |
[in,out] | d_value_in | Input values to be multisplit along with keys. |
[out] | d_key_out | Output keys after multisplit. |
[out] | d_value_out | Output keys after multisplit. |
[in] | num_elements | Number of elements. |
[in] | num_buckets | Number of buckets. |
[in] | context | Intermediate data storage for multisplit. |
[in] | bucket_identifier | Functor to map an element to a bucket number. |
[in] | in_place | Flag to indicate if results are copied back to inputs. |
[out] | bucket_offsets | Optional output list of bucket indices. |
void reducedBitSortKeysOnly | ( | unsigned int * | d_inp, |
uint | numElements, | ||
uint | numBuckets, | ||
T | bucketMapper, | ||
const CUDPPMultiSplitPlan * | plan | ||
) |
Performs multisplit on keys only using the reduced-bit sort method.
This function uses radix sort to perform a multisplit. It is suitable when the number of buckets is large.
[in,out] | d_inp | Keys to be multisplit. |
[in] | numElements | Number of elements. |
[in] | numBuckets | Number of buckets. |
[in] | bucketMapper | Functor that maps an element to a bucket number. |
[in] | plan | Configuration plan for multisplit. |
void reducedBitSortKeyValue | ( | unsigned int * | d_keys, |
unsigned int * | d_values, | ||
unsigned int | numElements, | ||
unsigned int | numBuckets, | ||
T | bucketMapper, | ||
const CUDPPMultiSplitPlan * | plan | ||
) |
Performs multisplit on key-value pairs using a reduced-bit sort.
This function uses radix sort to perform a multisplit on a list of keys and a list of values. It is suitable when the number of buckets is large.
[in,out] | d_keys | Keys to be multisplit. |
[in,out] | d_values | Associated values to be multisplit |
[in] | numElements | Number of key-value pairs. |
[in] | numBuckets | Number of buckets. |
[in] | bucketMapper | Functor that maps an element to a bucket number. |
[in] | plan | Configuration information for multisplit. |
void allocMultiSplitStorage | ( | CUDPPMultiSplitPlan * | plan | ) |
From the programmer-specified multisplit configuration, creates internal memory for performing the multisplit. Different storage amounts are required depending on the number of buckets.
[in] | plan | Pointer to CUDPPMultiSplitPlan object |
void freeMultiSplitStorage | ( | CUDPPMultiSplitPlan * | plan | ) |
Deallocates intermediate memory from allocMultiSplitStorage.
[in] | plan | Pointer to CUDPPMultiSplitPlan object |
void cudppMultiSplitDispatch | ( | unsigned int * | d_keys, |
unsigned int * | d_values, | ||
size_t | numElements, | ||
size_t | numBuckets, | ||
BucketMappingFunc | bucketMappingFunc, | ||
const CUDPPMultiSplitPlan * | plan | ||
) |
Dispatch function to perform multisplit on an array of elements into a number of buckets.
This is the dispatch routine which calls multiSplit...() with appropriate parameters, including the bucket mapping function specified by plan's configuration.
Currently only splits unsigned integers.
[in,out] | keys | Keys to be split. |
[in,out] | values | Optional associated values to be split (through keys), can be NULL. |
[in] | numElements | Number of elements to be split. |
[in] | plan | Configuration information for multiSplit. |
void allocRadixSortStorage | ( | CUDPPRadixSortPlan * | plan | ) |
From the programmer-specified sort configuration, creates internal memory for performing the sort.
[in] | plan | Pointer to CUDPPRadixSortPlan object |
void freeRadixSortStorage | ( | CUDPPRadixSortPlan * | plan | ) |
Deallocates intermediate memory from allocRadixSortStorage.
[in] | plan | Pointer to CUDPPRadixSortPlan object |
void cudppRadixSortDispatch | ( | void * | keys, |
void * | values, | ||
size_t | numElements, | ||
const CUDPPRadixSortPlan * | plan | ||
) |
Dispatch function to perform a sort on an array with a specified configuration.
This is the dispatch routine which calls radixSort...() with appropriate template parameters and arguments as specified by the plan.
[in,out] | keys | Keys to be sorted. |
[in,out] | values | Associated values to be sorted (through keys). |
[in] | numElements | Number of elements in the sort. |
[in] | plan | Configuration information for RadixSort. |
void KeyValueSort | ( | unsigned int | num_elements, |
unsigned int * | d_keys, | ||
unsigned int * | d_values | ||
) |
Radix Sort kernel from NVlab cub library.
[in] | num_elements | Number of elements to sort. |
[in] | d_keys | Key values of the elements in the array to be sorted. |
[in,out] | d_values | Positions of the elements in the array. |
void ComputeSA | ( | unsigned int * | d_str, |
unsigned int * | d_keys_sa, | ||
size_t | str_length, | ||
mgpu::CudaContext & | context, | ||
CUDPPSaPlan * | plan, | ||
unsigned int | offset, | ||
unsigned int | stage | ||
) |
Perform Suffix Array (SA) using skew algorithm.
Performs recursive skew kernel on a given character string. A suffix array is a sorted array of all suffixes of a string. Skew algorithm is a linear-time algorithm based on divde and conquer. The SA of a string can be used as an index to quickly locate every occurrence of a substring pattern within the string. Suffix sorting algorithms can be used to compute the Burrows-Wheeler Transform(BWT). The BWT requires sorting of all cyclic permutations of a string, thus can be computed in linear time by using a suffix array of the string.
[in] | d_str | An unsigned int array of the input data stream to perform the SA on. |
[out] | d_keys_sa | An array to store the output of the SA. |
[in] | str_length | Total number of input elements. |
[in] | context | Context format required by mgpu functions. |
[in,out] | plan | Pointer to the plan object used for this suffix array. |
[in,out] | offset | Offset to move head pointer to find the memory for each iteration. |
[in,out] | stage | Stage for each iteration. |
void allocSaStorage | ( | CUDPPSaPlan * | plan | ) |
Allocate intermediate arrays used by suffix array.
[in,out] | plan | Pointer to CUDPPSaPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void freeSaStorage | ( | CUDPPSaPlan * | plan | ) |
Deallocate intermediate block arrays in a CUDPPSaPlan object.
[in,out] | plan | Pointer to CUDPPSaPlan object initialized by allocSaStorage(). |
void cudppSuffixArrayDispatch | ( | void * | d_str, |
unsigned int * | d_keys_sa, | ||
size_t | d_str_length, | ||
CUDPPSaPlan * | plan | ||
) |
Dispatch function to perform parallel suffix array on a string with the specified configuration.
[in] | d_str | input string with three $ |
[out] | d_keys_sa | lexicographically sorted suffix position array |
[in] | d_str_length | Number of elements in the string including $ |
[in] | plan | Pointer to CUDPPSaPlan object containing suffix_array options and intermediate storage |
void scanArrayRecursive | ( | T * | d_out, |
const T * | d_in, | ||
T ** | d_blockSums, | ||
size_t | numElements, | ||
size_t | numRows, | ||
const size_t * | rowPitches, | ||
int | level | ||
) |
Perform recursive scan on arbitrary size arrays.
This is the CPU-side workhorse function of the scan engine. This function invokes the CUDA kernels which perform the scan on individual blocks.
Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level has more than one block, it calls itself recursively. On returning from each recursive level, the total sum of each block from the level below is added to all elements of the corresponding block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see References ).
Template parameter T is the datatype; isBackward specifies backward or forward scan; isExclusive specifies exclusive or inclusive scan, and op specifies the binary associative operator to be used.
[out] | d_out | The output array for the scan results |
[in] | d_in | The input array to be scanned |
[out] | d_blockSums | Array of arrays of per-block sums (one array per recursive level, allocated by allocScanStorage()) |
[in] | numElements | The number of elements in the array to scan |
[in] | numRows | The number of rows in the array to scan |
[in] | rowPitches | Array of row pitches (one array per recursive level, allocated by allocScanStorage()) |
[in] | level | The current recursive level of the scan |
void allocScanStorage | ( | CUDPPScanPlan * | plan | ) |
Allocate intermediate arrays used by scan.
Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scan, we need an array in which to store the total sums of all blocks in that level. This function computes the amount of storage needed and allocates it.
plan | Pointer to CUDPPScanPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void freeScanStorage | ( | CUDPPScanPlan * | plan | ) |
Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
These arrays must have been allocated by allocScanStorage(), which is called by the constructor of cudppScanPlan().
plan | Pointer to CUDPPScanPlan object initialized by allocScanStorage(). |
void cudppScanDispatch | ( | void * | d_out, |
const void * | d_in, | ||
size_t | numElements, | ||
size_t | numRows, | ||
const CUDPPScanPlan * | plan | ||
) |
Dispatch function to perform a scan (prefix sum) on an array with the specified configuration.
This is the dispatch routine which calls scanArrayRecursive() with appropriate template parameters and arguments to achieve the scan as specified in plan.
[out] | d_out | The output array of scan results |
[in] | d_in | The input array |
[in] | numElements | The number of elements to scan |
[in] | numRows | The number of rows to scan in parallel |
[in] | plan | Pointer to CUDPPScanPlan object containing scan options and intermediate storage |
void runStringSort | ( | unsigned int * | pkeys, |
unsigned int * | pvals, | ||
unsigned int * | stringVals, | ||
size_t | numElements, | ||
size_t | stringArrayLength, | ||
unsigned char | termC, | ||
const CUDPPStringSortPlan * | plan | ||
) |
Performs merge sor utilzing three stages. (1) Blocksort, (2) simple merge and (3) multi merge on a set of strings.
[in,out] | pkeys | Keys (first four characters of string) to be sorted. |
[in,out] | pvals | Addresses of string locations for tie-breaks |
[out] | stringVals | global string value array (four characters stuffed into a uint) |
[in] | numElements | Number of elements in the sort. |
[in] | stringArrayLength | The size of our string array in uints (4 chars per uint) |
[in] | plan | Configuration information for mergesort. |
[in] | termC | Termination character for our strings |
void allocStringSortStorage | ( | CUDPPStringSortPlan * | plan | ) |
From the programmer-specified sort configuration, creates internal memory for performing the sort.
[in] | plan | Pointer to CUDPPStringSortPlan object |
void freeStringSortStorage | ( | CUDPPStringSortPlan * | plan | ) |
Deallocates intermediate memory from allocStringSortStorage.
[in] | plan | Pointer to CUDPStringSortPlan object |
void cudppStringSortDispatch | ( | unsigned int * | keys, |
unsigned int * | values, | ||
unsigned int * | stringVals, | ||
size_t | numElements, | ||
size_t | stringArrayLength, | ||
unsigned char | termC, | ||
const CUDPPStringSortPlan * | plan | ||
) |
Dispatch function to perform a sort on an array with a specified configuration.
This is the dispatch routine which calls stringSort...() with appropriate template parameters and arguments as specified by the plan.
[in,out] | keys | Keys (first four chars of string) to be sorted. |
[in,out] | values | Address of string values in array of null terminated strings |
[in] | stringVals | Global string array |
[in] | numElements | Number of elements in the sort. |
[in] | stringArrayLength | The size of our string array in uints (4 chars per uint) |
[in] | termC | Termination character for our strings |
[in] | plan | Configuration information for mergeSort. |
void crpcr | ( | T * | d_a, |
T * | d_b, | ||
T * | d_c, | ||
T * | d_d, | ||
T * | d_x, | ||
unsigned int | systemSizeOriginal, | ||
unsigned int | numSystems | ||
) |
Hybrid CR-PCR solver (CRPCR)
This is a wrapper function for the GPU CR-PCR kernel.
[out] | d_x | Solution vector |
[in] | d_a | Lower diagonal |
[in] | d_b | Main diagonal |
[in] | d_c | Upper diagonal |
[in] | d_d | Right hand side |
[in] | systemSizeOriginal | The size of the linear system |
[in] | numSystems | The number of systems to be solved |
CUDPPResult cudppTridiagonalDispatch | ( | void * | d_a, |
void * | d_b, | ||
void * | d_c, | ||
void * | d_d, | ||
void * | d_x, | ||
int | systemSize, | ||
int | numSystems, | ||
const CUDPPTridiagonalPlan * | plan | ||
) |
Dispatches the tridiagonal function based on the plan.
This is the dispatch call for the tridiagonal solver in either float or double datatype.
[out] | d_x | Solution vector |
[in] | d_a | Lower diagonal |
[in] | d_b | Main diagonal |
[in] | d_c | Upper diagonal |
[in] | d_d | Right hand side |
[in] | systemSize | The size of the linear system |
[in] | numSystems | The number of systems to be solved |
[in] | plan | pointer to CUDPPTridiagonalPlan |