CUDPP  2.3
CUDA Data-Parallel Primitives Library
Functions
CUDPP Kernel-Level API

Functions

__global__ void alignedOffsets (unsigned int *numSpaces, unsigned int *d_address, unsigned char *d_stringVals, unsigned char termC, unsigned int numElements, unsigned int stringSize)
 Calculate the number of spaces required for each string to align the string array. More...
 
__global__ void alignString (unsigned int *packedStrings, unsigned char *d_stringVals, unsigned int *packedAddress, unsigned int *address, unsigned int numElements, unsigned int stringArrayLength, unsigned char termC)
 Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned. More...
 
__global__ void createKeys (unsigned int *d_keys, unsigned int *packedStrings, unsigned int *packedAddress, unsigned int numElements)
 Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array. More...
 
__global__ void unpackAddresses (unsigned int *packedAddress, unsigned int *packedAddressRef, unsigned int *address, unsigned int *addressRef, size_t numElements)
 Converts addresses from packed (unaligned) form to unpacked and unaligned form. Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4) More...
 
template<class T , int depth>
__global__ void blockWiseStringSort (T *A_keys, T *A_address, T *stringVals, int blockSize, int totalSize, unsigned int stringSize, unsigned char termC)
 Does an initial blockSort based on the size of our partition (limited by shared memory size) More...
 
template<class T , int depth>
__global__ void simpleStringMerge (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int sizePerPartition, int size, int step, int stringSize, unsigned char termC)
 Merges two independent sets. Each CUDA block works on two partitions of data without cooperating. More...
 
template<class T >
__global__ void findMultiPartitions (T *A_keys, T *A_address, T *stringValues, int splitsPP, int numPartitions, int partitionSize, unsigned int *partitionBeginA, unsigned int *partitionSizesA, unsigned int *partitionBeginB, unsigned int *partitionSizesB, size_t size, size_t stringSize, unsigned char termC)
 For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions. More...
 
template<class T , int depth>
__global__ void stringMergeMulti (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int subPartitions, int numBlocks, unsigned int *partitionBeginA, unsigned int *partitionSizeA, unsigned int *partitionBeginB, unsigned int *partitionSizeB, int entirePartitionSize, int step, size_t size, size_t stringSize, unsigned char termC)
 Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s) More...
 

Compact Functions

template<class T , bool isBackward>
__global__ void compactData (T *d_out, size_t *d_numValidElements, const unsigned int *d_indices, const unsigned int *d_isValid, const T *d_in, unsigned int numElements)
 Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray(). More...
 

Compress Functions

typedef unsigned int uint
 
typedef unsigned char uchar
 
typedef unsigned short ushort
 
__global__ void bwt_compute_final_kernel (const uchar *d_bwtIn, const uint *d_values, int *d_bwtIndex, uchar *d_bwtOut, uint numElements, uint tThreads)
 Compute final BWT. More...
 
template<class T , int depth>
__global__ void stringMergeMulti (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int *partitionBeginB, int *partitionSizeB, int entirePartitionSize, size_t numElements)
 Multi merge. More...
 
template<class T >
__global__ void findMultiPartitions (T *A, int splitsPP, int numPartitions, int partitionSize, int *partitionBeginA, int *partitionSizesA, int *partitionBeginB, int *partitionSizesB, int sizeA)
 Merges the indices for the "upper" block (right block) More...
 
template<class T , int depth>
__global__ void simpleStringMerge (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int sizePerPartition, int size, T *stringValues2, size_t numElements)
 Simple merge. More...
 
template<class T , int depth>
__global__ void blockWiseStringSort (T *A_keys, T *A_address, const T *stringVals, T *stringVals2, int blockSize, size_t numElements)
 Sorts blocks of data of size blockSize. More...
 
__global__ void bwt_keys_construct_kernel (uchar4 *d_bwtIn, uint *d_bwtInRef, uint *d_keys, uint *d_values, uint *d_bwtInRef2, uint tThreads)
 Massage input to set up for merge sort. More...
 
__global__ void mtf_reduction_kernel (const uchar *d_mtfIn, uchar *d_lists, ushort *d_list_sizes, uint nLists, uint offset, uint numElements)
 First stage in MTF (Reduction) More...
 
__global__ void mtf_GLreduction_kernel (uchar *d_lists, ushort *d_list_sizes, uint offset, uint tThreads, uint nLists)
 Second stage in MTF (Global reduction) More...
 
__global__ void mtf_GLdownsweep_kernel (uchar *d_lists, ushort *d_list_sizes, uint offset, uint lastLevel, uint nLists, uint tThreads)
 Third stage in MTF (Global downsweep) More...
 
__global__ void mtf_localscan_lists_kernel (const uchar *d_mtfIn, uchar *d_mtfOut, uchar *d_lists, ushort *d_list_sizes, uint nLists, uint offset, uint numElements)
 Compute final MTF lists and final MTF output. More...
 
__global__ void huffman_build_histogram_kernel (uint *d_input, uint *d_histograms, uint numElements)
 Compute 256-entry histogram. More...
 
__global__ void histo_kernel (uchar *d_input, uint *d_histograms, uint numElements)
 
__global__ void huffman_build_tree_kernel (const uchar *d_input, uchar *d_huffCodesPacked, uint *d_huffCodeLocations, uchar *d_huffCodeLengths, uint *d_histograms, uint *d_histogram, uint *d_nCodesPacked, uint *d_totalEncodedSize, uint histBlocks, uint numElements)
 Build Huffman tree/codes. More...
 
__global__ void huffman_kernel_en (uchar4 *d_input, uchar *d_codes, uint *d_code_locations, uchar *d_huffCodeLengths, encoded *d_encoded, uint nCodesPacked, uint nThreads)
 Perform parallel Huffman encoding. More...
 
__global__ void huffman_datapack_kernel (encoded *d_encoded, uint *d_encodedData, uint *d_totalEncodedSize, uint *d_eOffsets)
 Pack together encoded blocks. More...
 

ListRank Functions

typedef unsigned int uint
 
typedef unsigned char uchar
 
typedef unsigned short ushort
 
template<typename T >
__global__ void list_rank_kernel_soa_1 (T *d_ranked_values, const T *d_unranked_values, const int *d_ping, int *d_pong, int *d_start_indices, int step, int head, int numElts)
 Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank(). More...
 
template<typename T >
__global__ void list_rank_kernel_soa_2 (T *d_ranked_values, const T *d_unranked_values, const int *d_pong, const int *d_start_indices, int head, int numElts)
 After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank(). More...
 

MergeSort Functions

typedef unsigned int uint
 
template<class T >
__global__ void simpleCopy (T *A_keys_dev, unsigned int *A_vals_dev, T *A_keys_out_dev, unsigned int *A_vals_out_dev, int offset, int numElementsToCopy)
 Copies unused portions of arrays in our ping-pong strategy. More...
 
template<class T , int depth>
__global__ void blockWiseSort (T *A_keys, unsigned int *A_values, int blockSize, size_t totalSize)
 Sorts blocks of data of size blockSize. More...
 
template<class T , int depth>
__global__ void simpleMerge_lower (T *A_keys, unsigned int *A_values, T *A_keys_out, unsigned int *A_values_out, int sizePerPartition, int size)
 Merges the indices for the "lower" block (left block) More...
 
template<class T , int depth>
__global__ void simpleMerge_higher (T *A_keys, unsigned int *A_values, T *A_keys_out, unsigned int *A_values_out, int sizePerPartition, int size)
 Merges the indices for the "upper" block (right block) More...
 
template<class T >
__global__ void findMultiPartitions (T *A, int splitsPP, int numPartitions, int partitionSize, int *partitionBeginA, int *partitionSizesA, int sizeA)
 Merges the indices for the "upper" block (right block) More...
 
template<class T , int depth>
__global__ void mergeMulti_lower (T *A_keys_out, unsigned int *A_vals_out, T *A_keys, unsigned int *A_vals, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int entirePartitionSize, int sizeA)
 Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block) More...
 
template<class T , int depth>
__global__ void mergeMulti_higher (T *A_keys_out, unsigned int *A_vals_out, T *A_keys, unsigned int *A_vals, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int entirePartitionSize, int sizeA)
 Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block) More...
 

Multisplit Functions

template<class T >
__global__ void markBins_general (uint *d_mark, uint *d_elements, uint numElements, uint numBuckets, T bucketMapper)
 
__global__ void packingKeyValuePairs (uint64 *packed, uint *input_key, uint *input_value, uint numElements)
 
__global__ void unpackingKeyValuePairs (uint64 *packed, uint *out_key, uint *out_value, uint numElements)
 
template<uint32_t NUM_W, uint32_t DEPTH, typename bucket_t , typename key_t >
__global__ void histogram_pre_scan_compaction (key_t *input, uint32_t *bin, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t DEPTH, typename bucket_t , typename key_t >
__global__ void split_post_scan_compaction (key_t *key_input, uint32_t *warpOffsets, key_t *key_output, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t DEPTH, typename bucket_t , typename key_t , typename value_t >
__global__ void split_post_scan_pairs_compaction (key_t *key_input, value_t *value_input, uint32_t *warpOffsets, key_t *key_output, value_t *value_output, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type >
__global__ void multisplit_WMS_prescan (key_type *input, uint32_t *bin, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type >
__global__ void multisplit_WMS_postscan (key_type *key_input, uint32_t *warpOffsets, key_type *key_output, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type , typename value_type >
__global__ void multisplit_WMS_pairs_postscan (key_type *key_input, value_type *value_input, uint32_t *warpOffsets, key_type *key_output, value_type *value_output, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t LOG_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type >
__global__ void multisplit_BMS_prescan (key_type *input, uint32_t *bin, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t LOG_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type >
__global__ void multisplit_BMS_postscan (key_type *key_input, uint32_t *blockOffsets, key_type *key_output, uint32_t numElements, bucket_t bucket_identifier)
 
template<uint32_t NUM_W, uint32_t LOG_W, uint32_t NUM_B, uint32_t LOG_B, uint32_t DEPTH, typename bucket_t , typename key_type , typename value_type >
__global__ void multisplit_BMS_pairs_postscan (key_type *key_input, value_type *value_input, uint32_t *blockOffsets, key_type *key_output, value_type *value_output, uint32_t numElements, bucket_t bucket_identifier)
 

RadixSort Functions

typedef unsigned int uint
 
__global__ void emptyKernel ()
 An empty kernel used to reset CTA issue hardware.
 
__global__ void flipFloats (uint *values, uint numValues)
 Does special binary arithmetic before sorting floats. More...
 
__global__ void unflipFloats (uint *values, uint numValues)
 Undoes the flips from flipFloats. More...
 
template<bool flip>
__global__ void radixSortSingleWarp (uint *keys, uint *values, uint numElements)
 Optimization for sorts of WARP_SIZE or fewer elements. More...
 
template<bool flip>
__global__ void radixSortSingleWarpKeysOnly (uint *keys, uint numElements)
 Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More...
 
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks)
 Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More...
 
template<uint startbit, bool fullBlocks, bool loop>
__global__ void findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks)
 Computes the number of keys of each radix in each block and stores the offsets. More...
 
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks)
 Reorders data in the global array. More...
 
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks)
 Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More...
 
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks)
 Reorders data in the global array. More...
 

Rand Functions

__global__ void gen_randMD5 (uint4 *d_out, size_t numElements, unsigned int seed)
 The main MD5 generation algorithm. More...
 

Reduce Functions

template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>
__global__ void reduce (T *odata, const T *idata, unsigned int n)
 Main reduction kernel. More...
 

Suffix Array Functions

typedef unsigned int uint
 
typedef unsigned char uchar
 
__global__ void strConstruct (uchar *d_str, uint *d_str_value, size_t str_length)
 Construct the input array. More...
 
__global__ void resultConstruct (uint *d_keys_sa, size_t str_length)
 Reconstruct the output. More...
 
__global__ void sa12_keys_construct (uint *d_str, uint *d_keys_uint_12, uint *d_keys_srt_12, int mod_1, size_t tThreads)
 Initialize the SA12 triplets. More...
 
__global__ void sa12_keys_construct_0 (uint *d_str, uint *d_keys_uint_12, uint *d_keys_srt_12, size_t tThreads)
 Construct SA12 for the second radix sort. More...
 
__global__ void sa12_keys_construct_1 (uint *d_str, uint *d_keys_uint_12, uint *d_keys_srt_12, size_t tThreads)
 Construct SA12 for the third radix sort. More...
 
__global__ void compute_rank (uint *d_str, uint *d_keys_srt_12, uint *d_flag, bool *result, size_t tThreads, int str_length)
 Turn on flags for sorted SA12 triplets. More...
 
__global__ void new_str_construct (uint *d_new_str, uint *d_keys_srt_12, uint *d_rank, int mod_1, size_t tThreads)
 Construct new array for recursion. More...
 
__global__ void reconstruct (uint *d_keys_srt_12, uint *d_isa_12, uint *d_flag, int mod_1, size_t tThreads)
 Translate SA12 from recursion. More...
 
__global__ void isa12_construct (uint *d_keys_srt_12, uint *d_isa_12, uint *d_flag, int mod_1, size_t tThreads)
 Construct ISA12. More...
 
__global__ void sa3_srt_construct (uint *d_keys_srt_3, uint *d_str, uint *d_keys_srt_12, uint *d_keys_sa, size_t tThreads1, size_t tThreads2, int str_length)
 Construct SA3 triplets positions. More...
 
__global__ void sa3_keys_construct (uint *d_keys_srt_3, uint *d_keys_sa, uint *d_str, size_t tThreads, int str_length)
 Construct SA3 triplets keys. More...
 
__global__ void merge_akeys_construct (uint *d_str, uint *d_keys_srt_12, uint *d_isa_12, Vector *d_aKeys, size_t tThreads, int mod_1, int bound, int str_length)
 Construct SA12 keys in terms of Vector. More...
 
__global__ void merge_bkeys_construct (uint *d_str, uint *d_keys_srt_3, uint *d_isa_12, Vector *d_bKeys, size_t tThreads, int mod_1, int bound, int str_length)
 Construct SA3 keys in Vector. More...
 

Scan Functions

template<class T , class traits >
__global__ void scan4 (T *d_out, const T *d_in, T *d_blockSums, int numElements, unsigned int dataRowPitch, unsigned int blockSumRowPitch)
 Main scan kernel. More...
 

Segmented scan Functions

template<class T , class traits >
__global__ void segmentedScan4 (T *d_odata, const T *d_idata, const unsigned int *d_iflags, unsigned int numElements, T *d_blockSums=0, unsigned int *d_blockFlags=0, unsigned int *d_blockIndices=0)
 Main segmented scan kernel. More...
 

Sparse Matrix-Vector multiply Functions

template<class T , bool isFullBlock>
__global__ void sparseMatrixVectorFetchAndMultiply (unsigned int *d_flags, T *d_prod, const T *d_A, const T *d_x, const unsigned int *d_indx, unsigned int numNZElts)
 Fetch and multiply kernel. More...
 
__global__ void sparseMatrixVectorSetFlags (unsigned int *d_flags, const unsigned int *d_rowindx, unsigned int numRows)
 Set Flags kernel. More...
 
template<class T >
__global__ void yGather (T *d_y, const T *d_prod, const unsigned int *d_rowFindx, unsigned int numRows)
 Gather final y values kernel. More...
 

Tridiagonal functions

template<class T >
__global__ void crpcrKernel (T *d_a, T *d_b, T *d_c, T *d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int iterations)
 Hybrid CR-PCR Tridiagonal linear system solver (CRPCR) More...
 

Vector Functions

CUDA kernel methods for basic operations on vectors.

template<class T >
__global__ void vectorAddConstant (T *d_vector, T constant, int n, int baseIndex)
 Adds a constant value to all values in the input d_vector. More...
 
template<class T >
__global__ void vectorAddUniform (T *d_vector, const T *d_uniforms, int numElements, int blockOffset, int baseIndex)
 Add a uniform value to each data element of an array. More...
 
template<typename T >
__global__ void vectorAddUniform2 (T *g_data, T *uniforms, int n, int eltsPerBlock)
 
template<class T , class Oper , int elementsPerThread, bool fullBlocks>
__global__ void vectorAddUniform4 (T *d_vector, const T *d_uniforms, int numElements, int vectorRowPitch, int uniformRowPitch, int blockOffset, int baseIndex)
 Add a uniform value to each data element of an array (vec4 version) More...
 
template<class T >
__global__ void vectorAddVector (T *d_vectorA, const T *d_vectorB, int numElements, int baseIndex)
 Adds together two vectors. More...
 
template<class T , class Oper , bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4 (T *d_vector, const T *d_uniforms, const unsigned int *d_maxIndices, unsigned int numElements, int blockOffset, int baseIndex)
 Add a uniform value to data elements of an array (vec4 version) More...
 
template<class T , class Oper , bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4 (T *d_vector, const T *d_uniforms, const unsigned int *d_minIndices, unsigned int numElements, int blockOffset, int baseIndex)
 Add a uniform value to data elements of an array (vec4 version) More...
 

Detailed Description

The CUDPP Kernel-Level API contains functions that run on the GPU device across a grid of Cooperative Thread Arrays (CTAs, aka thread blocks). These kernels are declared __global__, so they must be invoked from host (CPU) code. They generally invoke GPU device routines in the CUDPP CTA-Level API. Kernel-Level API functions are used by CUDPP Application-Level functions to implement their functionality.

Function Documentation

template<class T , bool isBackward>
__global__ void compactData ( T *  d_out,
size_t *  d_numValidElements,
const unsigned int *  d_indices,
const unsigned int *  d_isValid,
const T *  d_in,
unsigned int  numElements 
)

Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().

Parameters
[out]d_outOutput array of compacted values.
[out]d_numValidElementsThe number of elements in d_in with valid flags set to 1.
[in]d_indicesPositions where non-null elements will go in d_out.
[in]d_isValidFlags indicating valid (1) and invalid (0) elements. Only valid elements will be copied to d_out.
[in]d_inThe input array
[in]numElementsThe length of the d_in in elements.
__global__ void bwt_compute_final_kernel ( const uchar *  d_bwtIn,
const uint *  d_values,
int *  d_bwtIndex,
uchar *  d_bwtOut,
uint  numElements,
uint  tThreads 
)

Compute final BWT.

This is the final stage in the BWT. This stage computes the final values of the BWT output. It is given the indices of where each of the cyclical rotations of the initial input were sorted to. It uses these indices to figure out the last "column" of the sorted cyclical rotations which is the final BWT output.

Parameters
[in]d_bwtInInput char array to perform the BWT on.
[in]d_valuesInput array that gives the indices of where each of the cyclical rotations of the initial input were sorted to.
[out]d_bwtIndexOutput pointer to store the BWT index. The index tells us where the original string sorted to.
[out]d_bwtOutOutput char array of the BWT.
[in]numElementsThe number of elements we are performing a BWT on.
[in]tThreadsThe total threads we have dispatched on the device.
template<class T , int depth>
__global__ void stringMergeMulti ( T *  A_keys,
T *  A_keys_out,
T *  A_values,
T *  A_values_out,
T *  stringValues,
int  subPartitions,
int  numBlocks,
int *  partitionBeginA,
int *  partitionSizeA,
int *  partitionBeginB,
int *  partitionSizeB,
int  entirePartitionSize,
size_t  numElements 
)

Multi merge.

Parameters
[in]A_keyskeys to be sorted
[out]A_keys_outkeys after being sorted
[in]A_valuesassociated values to keys
[out]A_values_outassociated values after sort
[in]stringValueskeys of each of the cyclical rotations
[in]subPartitionsNumber of blocks working on a partition (number of sub-partitions)
[in]numBlocks
[out]partitionBeginAWhere each partition/subpartition will begin in A
[in]partitionSizeAPartition sizes decided by function findMultiPartitions
[out]partitionBeginBWhere each partition/subpartition will begin in B
[in]partitionSizeBPartition sizes decided by function findMultiPartitions
[in]entirePartitionSizeThe size of an entire partition (before it is split up)
[in]numElementsSize of the entire array
template<class T >
__global__ void findMultiPartitions ( T *  A,
int  splitsPP,
int  numPartitions,
int  partitionSize,
int *  partitionBeginA,
int *  partitionSizesA,
int *  partitionBeginB,
int *  partitionSizesB,
int  sizeA 
)

Merges the indices for the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters
[in]AGlobal array of keys
[in]splitsPPNumber of splits (sub-partitions) per partition
[in]numPartitionsnumber of partitions being considered
[in]partitionSizeSize of each partition being considered
[out]partitionBeginAWhere each partition/subpartition will begin in A
[out]partitionSizesASize of each partition/subpartition in A
[out]partitionBeginBWhere each partition/subpartition will begin in B
[out]partitionSizesBSize of each partition/subpartition in B
[in]sizeASize of the entire array
template<class T , int depth>
__global__ void simpleStringMerge ( T *  A_keys,
T *  A_keys_out,
T *  A_values,
T *  A_values_out,
T *  stringValues,
int  sizePerPartition,
int  size,
T *  stringValues2,
size_t  numElements 
)

Simple merge.

Parameters
[in]A_keyskeys to be sorted
[out]A_keys_outkeys after being sorted
[in]A_valuesassociated values to keys
[out]A_values_outassociated values after sort
[in]stringValuesBWT string manipulated to words
[in]sizePerPartitionSize of each partition being merged
[in]sizeSize of total Array being sorted
[in]stringValues2keys of each of the cyclical rotations
[in]numElementsNumber of elements being sorted
template<class T , int depth>
__global__ void blockWiseStringSort ( T *  A_keys,
T *  A_address,
const T *  stringVals,
T *  stringVals2,
int  blockSize,
size_t  numElements 
)

Sorts blocks of data of size blockSize.

Parameters
[in,out]A_keyskeys to be sorted
[in,out]A_addressassociated values to keys
[in]stringValsBWT string manipulated to words
[in]stringVals2keys of each of the cyclical rotations
[in]blockSizeSize of the chunks being sorted
[in]numElementsSize of the entire array
__global__ void bwt_keys_construct_kernel ( uchar4 *  d_bwtIn,
uint *  d_bwtInRef,
uint *  d_keys,
uint *  d_values,
uint *  d_bwtInRef2,
uint  tThreads 
)

Massage input to set up for merge sort.

Parameters
[in]d_bwtInA char array of the input data stream to perform the BWT on.
[out]d_bwtInRefBWT string manipulated to words.
[out]d_keysAn array of associated keys to sort by the first four chars of the cyclical rotations.
[out]d_valuesArray of values associated with the keys to sort.
[out]d_bwtInRef2keys of each of the cyclical rotations.
[in]tThreadsThe total threads we have dispatched on the device.
__global__ void mtf_reduction_kernel ( const uchar *  d_mtfIn,
uchar *  d_lists,
ushort *  d_list_sizes,
uint  nLists,
uint  offset,
uint  numElements 
)

First stage in MTF (Reduction)

Parameters
[in]d_mtfInA char array of the input data stream to perform the MTF on.
[out]d_listsA pointer to the start of MTF lists.
[out]d_list_sizesAn array storing the size of each MTF list.
[in]nListsTotal number of MTF lists.
[in]offsetThe offset during the reduction stage. Initialized to two.
[in]numElementsTotal number of input elements MTF transform.
__global__ void mtf_GLreduction_kernel ( uchar *  d_lists,
ushort *  d_list_sizes,
uint  offset,
uint  tThreads,
uint  nLists 
)

Second stage in MTF (Global reduction)

Parameters
[in,out]d_listsA pointer to the start of MTF lists.
[in,out]d_list_sizesAn array storing the size of each MTF list.
[in]offsetThe offset during the reduction stage. Initialized to two.
[in]tThreadsTotal number of threads dispatched.
[in]nListsTotal number of MTF lists.
__global__ void mtf_GLdownsweep_kernel ( uchar *  d_lists,
ushort *  d_list_sizes,
uint  offset,
uint  lastLevel,
uint  nLists,
uint  tThreads 
)

Third stage in MTF (Global downsweep)

Parameters
[in,out]d_listsA pointer to the start of MTF lists.
[in,out]d_list_sizesAn array storing the size of each MTF list.
[in]offsetThe offset during the reduction stage.
[in]lastLevelThe limit to which offset can be set to.
[in]nListsTotal number of MTF lists.
[in]tThreadsTotal number of threads dispatched.
__global__ void mtf_localscan_lists_kernel ( const uchar *  d_mtfIn,
uchar *  d_mtfOut,
uchar *  d_lists,
ushort *  d_list_sizes,
uint  nLists,
uint  offset,
uint  numElements 
)

Compute final MTF lists and final MTF output.

Parameters
[in]d_mtfInA char array of the input data stream to perform the MTF on.
[out]d_mtfOutA char array of the output with the transformed MTF string.
[in,out]d_listsA pointer to the start of MTF lists.
[in]d_list_sizesAn array storing the size of each MTF list.
[in]nListsTotal number of MTF lists.
[in]offsetThe offset during the reduction stage.
[in]numElementsTotal number of elements to perform the MTF on.
__global__ void huffman_build_histogram_kernel ( uint *  d_input,
uint *  d_histograms,
uint  numElements 
)

Compute 256-entry histogram.

Parameters
[in]d_inputAn array of words we will use to build our histogram.
[out]d_histogramsA pointer where we store our global histograms.
[in]numElementsThe total number of elements to build our histogram from.
__global__ void huffman_build_tree_kernel ( const uchar *  d_input,
uchar *  d_huffCodesPacked,
uint *  d_huffCodeLocations,
uchar *  d_huffCodeLengths,
uint *  d_histograms,
uint *  d_histogram,
uint *  d_nCodesPacked,
uint *  d_totalEncodedSize,
uint  histBlocks,
uint  numElements 
)

Build Huffman tree/codes.

Parameters
[in]d_inputAn array of input elements to encode
[out]d_huffCodesPackedAn array of huffman bit codes packed together
[out]d_huffCodeLocationsAn array which stores the starting bit locations of each Huffman bit code
[out]d_huffCodeLengthsAn array which stores the lengths of each Huffman bit code
[in]d_histogramsAn input array of histograms to combine
[out]d_histogramFinal histogram combined
[out]d_nCodesPackedNumber of chars it took to store all Huffman bit codes
[out]d_totalEncodedSizeTotal number of words it takes to hold the compressed data
[in]histBlocksTotal number of histograms we will combine into one
[in]numElementsNumber of elements to compress
__global__ void huffman_kernel_en ( uchar4 *  d_input,
uchar *  d_codes,
uint *  d_code_locations,
uchar *  d_huffCodeLengths,
encoded *  d_encoded,
uint  nCodesPacked,
uint  nThreads 
)

Perform parallel Huffman encoding.

Parameters
[in]d_inputInput array to encode
[in]d_codesArray of packed Huffman bit codes
[in]d_code_locationsArray of starting Huffman bit locations
[in]d_huffCodeLengthsAn array storing the bit lengths of the Huffman codes
[out]d_encodedAn array of encoded classes which stores the size and data of encoded data
[in]nCodesPackedNumber of chars it took to store all Huffman bit codes
[in]nThreadsTotal number of dispatched threads
__global__ void huffman_datapack_kernel ( encoded *  d_encoded,
uint *  d_encodedData,
uint *  d_totalEncodedSize,
uint *  d_eOffsets 
)

Pack together encoded blocks.

Parameters
[in]d_encodedAn array of encoded objects with stored size and data of the encoded data.
[out]d_encodedDataAn array to store all encoded data.
[out]d_totalEncodedSizeTotal number of words of the encoded data.
[out]d_eOffsetsArray holding the word offsets of each encoded data block.
template<typename T >
__global__ void list_rank_kernel_soa_1 ( T *  d_ranked_values,
const T *  d_unranked_values,
const int *  d_ping,
int *  d_pong,
int *  d_start_indices,
int  step,
int  head,
int  numElts 
)

Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank().

Parameters
[out]d_ranked_valuesRanked values array
[in]d_unranked_valuesUnranked values array
[in]d_pingNext indices array for the current kernel call
[in]d_pongNext indices array for the next kernel call
[in]d_start_indicesHolds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage.
[in]stepThe number of "ranking" threads.
[in]headHead node index of the linked-list.
[in]numEltsNumber of nodes to rank
template<typename T >
__global__ void list_rank_kernel_soa_2 ( T *  d_ranked_values,
const T *  d_unranked_values,
const int *  d_pong,
const int *  d_start_indices,
int  head,
int  numElts 
)

After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank().

Parameters
[out]d_ranked_valuesRanked values array
[in]d_unranked_valuesUnranked values array
[in]d_pongNext indices array for the current kernel call
[in]d_start_indicesHolds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage.
[in]headHead node index of the linked-list.
[in]numEltsNumber of nodes to rank
template<class T >
__global__ void simpleCopy ( T *  A_keys_dev,
unsigned int *  A_vals_dev,
T *  A_keys_out_dev,
unsigned int *  A_vals_out_dev,
int  offset,
int  numElementsToCopy 
)

Copies unused portions of arrays in our ping-pong strategy.

Parameters
[in]A_keys_dev,A_vals_devThe keys and values we will be copying
[out]A_keys_out_dev,A_vals_out_devThe keys and values array we will copy to
[in]offsetThe offset we are starting to copy from
[in]numElementsToCopyThe number of elements we copy starting from the offset
[in]A_keys_devThe keys we will be copying
[in]A_vals_devThe values we will be copying
[out]A_keys_out_devThe destination keys array
[out]A_vals_out_devThe destination values array
[in]offsetThe offset we are starting to copy from
[in]numElementsToCopyThe number of elements we copy starting from the offset
template<class T , int depth>
__global__ void blockWiseSort ( T *  A_keys,
unsigned int *  A_values,
int  blockSize,
size_t  totalSize 
)

Sorts blocks of data of size blockSize.

Parameters
[in,out]A_keyskeys to be sorted
[in,out]A_valuesassociated values to keys
[in]blockSizeSize of the chunks being sorted
[in]totalSizeSize of the entire array
template<class T , int depth>
__global__ void simpleMerge_lower ( T *  A_keys,
unsigned int *  A_values,
T *  A_keys_out,
unsigned int *  A_values_out,
int  sizePerPartition,
int  size 
)

Merges the indices for the "lower" block (left block)

Utilizes a "ping-pong" strategy

Parameters
[in]A_keysGlobal array of keys to be merged
[in]A_valuesGlobal array of values to be merged
[out]A_keys_outResulting array of keys merged
[out]A_values_outResulting array of values merged
[in]sizePerPartitionSize of each partition being merged
[in]sizeSize of total Array being sorted
template<class T , int depth>
__global__ void simpleMerge_higher ( T *  A_keys,
unsigned int *  A_values,
T *  A_keys_out,
unsigned int *  A_values_out,
int  sizePerPartition,
int  size 
)

Merges the indices for the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters
[in]A_keysGlobal array of keys to be merged
[in]A_valuesGlobal array of values to be merged
[out]A_keys_outResulting array of keys merged
[out]A_values_outResulting array of values merged
[in]sizePerPartitionSize of each partition being merged
[in]sizeSize of total Array being sorted
template<class T >
__global__ void findMultiPartitions ( T *  A,
int  splitsPP,
int  numPartitions,
int  partitionSize,
int *  partitionBeginA,
int *  partitionSizesA,
int  sizeA 
)

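Computes where each partition/subpartition begins in A and how large it is, for use by the cooperative merge kernels (mergeMulti_lower / mergeMulti_higher).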

Utilizes a "ping-pong" strategy

Parameters
[in]AGlobal array of keys
[in]splitsPPNumber of splits per partition
[in]numPartitionsnumber of partitions being considered
[in]partitionSizeSize of each partition being considered
[out]partitionBeginAWhere each partition/subpartition will begin in A
[out]partitionSizesASize of each partition/subpartition in A
[in]sizeASize of the entire array
template<class T , int depth>
__global__ void mergeMulti_lower ( T *  A_keys_out,
unsigned int *  A_vals_out,
T *  A_keys,
unsigned int *  A_vals,
int  subPartitions,
int  numBlocks,
int *  partitionBeginA,
int *  partitionSizeA,
int  entirePartitionSize,
int  sizeA 
)

Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block)

Utilizes a "ping-pong" strategy

Parameters
[out]A_keys_outResulting array of keys merged
[out]A_vals_outResulting array of values merged
[in]A_keysGlobal array of keys to be merged
[in]A_valsGlobal array of values to be merged
[in]subPartitionsNumber of blocks working on a partition (number of sub-partitions)
[in]numBlocks
[in]partitionBeginAPartition starting points decided by function findMultiPartitions
[in]partitionSizeAPartition sizes decided by function findMultiPartitions
[in]entirePartitionSizeThe size of an entire partition (before it is split up)
[in]sizeAThe total size of our array
template<class T , int depth>
__global__ void mergeMulti_higher ( T *  A_keys_out,
unsigned int *  A_vals_out,
T *  A_keys,
unsigned int *  A_vals,
int  subPartitions,
int  numBlocks,
int *  partitionBeginA,
int *  partitionSizeA,
int  entirePartitionSize,
int  sizeA 
)

Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block)

Utilizes a "ping-pong" strategy

Parameters
[out]A_keys_outResulting array of keys merged
[out]A_vals_outResulting array of values merged
[in]A_keysGlobal array of keys to be merged
[in]A_valsGlobal array of values to be merged
[in]subPartitionsNumber of blocks working on a partition (number of sub-partitions)
[in]numBlocks
[in]partitionBeginAPartition starting points decided by function findMultiPartitions
[in]partitionSizeAPartition sizes decided by function findMultiPartitions
[in]entirePartitionSizeThe size of an entire partition (before it is split up)
[in]sizeAThe total size of our array
__global__ void flipFloats ( uint *  values,
uint  numValues 
)

Does special binary arithmetic before sorting floats.

Uses floatFlip function to flip bits.

Parameters
[in,out]valuesValues to be manipulated
[in]numValuesNumber of values to be flipped
__global__ void unflipFloats ( uint *  values,
uint  numValues 
)

Undoes the flips from flipFloats.

Uses floatUnflip function to unflip bits.

Parameters
[in,out]valuesValues to be manipulated
[in]numValuesNumber of values to be unflipped
template<bool flip>
__global__ void radixSortSingleWarp ( uint *  keys,
uint *  values,
uint  numElements 
)

Optimization for sorts of WARP_SIZE or fewer elements.

Parameters
[in,out]keysKeys to be sorted.
[in,out]valuesAssociated values to be sorted (through keys).
[in]numElementsNumber of elements in the sort.
template<bool flip>
__global__ void radixSortSingleWarpKeysOnly ( uint *  keys,
uint  numElements 
)

Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.

Parameters
[in,out]keysKeys to be sorted
[in]numElementsTotal number of elements to be sorted
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocks ( uint4 *  keysOut,
uint4 *  valuesOut,
uint4 *  keysIn,
uint4 *  valuesIn,
uint  numElements,
uint  totalBlocks 
)

Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases. For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs).

Parameters
[out]keysOutOutput of sorted keys
[out]valuesOutOutput of associated values
[in]keysInInput of unsorted keys in GPU
[in]valuesInInput of associated input values
[in]numElementsTotal number of elements to sort
[in]totalBlocksThe number of blocks of data to sort
template<uint startbit, bool fullBlocks, bool loop>
__global__ void findRadixOffsets ( uint2 *  keys,
uint *  counters,
uint *  blockOffsets,
uint  numElements,
uint  totalBlocks 
)

Computes the number of keys of each radix in each block and stores the offsets.

Given an array with blocks sorted according to a 4-bit radix group, each block counts the number of keys that fall into each radix in the group, and finds the starting offset of each radix in the block. It then writes the radix counts to the counters array, and the starting offsets to the blockOffsets array.

Template parameters are used to generate efficient code for various special cases. For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs).

Parameters
[in]keysInput keys
[out]countersRadix count for each block
[out]blockOffsetsThe offset address for each block
[in]numElementsTotal number of elements
[in]totalBlocksTotal number of blocks
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderData ( uint *  outKeys,
uint *  outValues,
uint2 *  keys,
uint2 *  values,
uint *  blockOffsets,
uint *  offsets,
uint *  sizes,
uint  numElements,
uint  totalBlocks 
)

Reorders data in the global array.

reorderData shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases. For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs).

Parameters
[out]outKeysOutput of sorted keys
[out]outValuesOutput of associated values
[in]keysInput of unsorted keys in GPU
[in]valuesInput of associated input values
[in]blockOffsetsThe offset address for each block
[in]offsetsAddress of each radix within each block
[in]sizesNumber of elements in a block
[in]numElementsTotal number of elements
[in]totalBlocksTotal number of data blocks to process
Todo:
Args that are const below should be prototyped as const
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void radixSortBlocksKeysOnly ( uint4 *  keysOut,
uint4 *  keysIn,
uint  numElements,
uint  totalBlocks 
)

Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.

The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)

Template parameters are used to generate efficient code for various special cases. For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs).

Parameters
[out]keysOutOutput of sorted keys GPU main memory
[in]keysInInput of unsorted keys in GPU main memory
[in]numElementsTotal number of elements to sort
[in]totalBlocksTotal number of blocks to sort
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void reorderDataKeysOnly ( uint *  outKeys,
uint2 *  keys,
uint *  blockOffsets,
uint *  offsets,
uint *  sizes,
uint  numElements,
uint  totalBlocks 
)

Reorders data in the global array.

reorderDataKeysOnly shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).

On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.

Template parameters are used to generate efficient code for various special cases. For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.

By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs).

Parameters
[out]outKeysOutput result of reorderDataKeysOnly()
[in]keysKeys to be reordered
[in]blockOffsetsStart offset for each block
[in]offsetsOffset of each radix within each block
[in]sizesNumber of elements in a block
[in]numElementsTotal number of elements
[in]totalBlocksTotal number of blocks
__global__ void gen_randMD5 ( uint4 *  d_out,
size_t  numElements,
unsigned int  seed 
)

The main MD5 generation algorithm.

This function runs the MD5 hashing random number generator. It generates MD5 hashes, and uses the output as randomized bits. To repeatedly call this function, always call cudppRandSeed() first to set a new seed or else the output may be the same due to the deterministic nature of hashes. gen_randMD5 generates 128 random bits per thread. Therefore, the parameter d_out is expected to be an array of type uint4 with numElements indices.

Parameters
[out]d_outthe output array of type uint4.
[in]numElementsthe number of elements in d_out
[in]seedthe random seed used to vary the output
See also
launchRandMD5Kernel()
template<typename T , class Oper , unsigned int blockSize, bool nIsPow2>
__global__ void reduce ( T *  odata,
const T *  idata,
unsigned int  n 
)

Main reduction kernel.

This reduction kernel adds multiple elements per thread sequentially, and then the threads work together to produce a block sum in shared memory. The code is optimized using warp-synchronous programming to eliminate unnecessary barrier synchronization. Performing sequential work in each thread before performing the log(N) parallel summation reduces the overall cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). (Brent's Theorem optimization)

Parameters
[out]odataThe output data pointer. Each block writes a single output element.
[in]idataThe input data pointer.
[in]nThe number of elements to be reduced.
__global__ void strConstruct ( uchar *  d_str,
uint *  d_str_value,
size_t  str_length 
)

Construct the input array.

This is the first stage in the SA. This stage constructs the input array composed of values of the input char array followed by three 0s.

Parameters
[in]d_strInput char array to perform the SA on.
[out]d_str_valueOutput unsigned int array prepared for SA.
[in]str_lengthThe number of elements we are performing the SA on.
__global__ void resultConstruct ( uint *  d_keys_sa,
size_t  str_length 
)

Reconstruct the output.

This is the final stage in the SA. This stage reconstructs the output array by reducing each value by one.

Parameters
[in,out]d_keys_saFinal output of the suffix array which stores the positions of sorted suffixes.
[in]str_lengthSize of the array.
__global__ void sa12_keys_construct ( uint *  d_str,
uint *  d_keys_uint_12,
uint *  d_keys_srt_12,
int  mod_1,
size_t  tThreads 
)

Initialize the SA12 triplets.

Parameters
[in]d_strInitial array of character values.
[out]d_keys_uint_12The keys of right-most char in SA12 triplets.
[out]d_keys_srt_12SA12 triplets positions.
[in]mod_1The number of elements whose positions mod3 = 1 (SA1)
[in]tThreadsThe number of elements whose positions mod3 = 1,2 (SA12)
__global__ void sa12_keys_construct_0 ( uint *  d_str,
uint *  d_keys_uint_12,
uint *  d_keys_srt_12,
size_t  tThreads 
)

Construct SA12 for the second radix sort.

Parameters
[in]d_strInitial array of character values.
[out]d_keys_uint_12The keys of second char in SA12 triplets.
[in]d_keys_srt_12SA12 triplets positions.
[in]tThreadsThe number of elements in SA12.
__global__ void sa12_keys_construct_1 ( uint *  d_str,
uint *  d_keys_uint_12,
uint *  d_keys_srt_12,
size_t  tThreads 
)

Construct SA12 for the third radix sort.

Parameters
[in]d_strInitial array of character values.
[out]d_keys_uint_12The keys of third char in SA12 triplets.
[in]d_keys_srt_12SA12 triplets positions.
[in]tThreadsThe number of elements in SA12.
__global__ void compute_rank ( uint *  d_str,
uint *  d_keys_srt_12,
uint *  d_flag,
bool *  result,
size_t  tThreads,
int  str_length 
)

Turn on flags for sorted SA12 triplets.

Parameters
[in]d_strInitial array of character values.
[in]d_keys_srt_12SA12 triplets positions.
[out]d_flagMarking the sorted triplets.
[out]result0 if SA12 is not fully sorted.
[in]tThreadsThe number of elements in SA12.
[in]str_lengthThe number of elements in original string.
__global__ void new_str_construct ( uint *  d_new_str,
uint *  d_keys_srt_12,
uint *  d_rank,
int  mod_1,
size_t  tThreads 
)

Construct new array for recursion.

Parameters
[out]d_new_strThe new string to be sent to recursion.
[in]d_keys_srt_12SA12 triplets positions.
[in]d_rankRanks of SA12 from compute_rank kernel.
[in]mod_1The number of elements of SA1.
[in]tThreadsThe number of elements of SA12.
__global__ void reconstruct ( uint *  d_keys_srt_12,
uint *  d_isa_12,
uint *  d_flag,
int  mod_1,
size_t  tThreads 
)

Translate SA12 from recursion.

Parameters
[in,out]d_keys_srt_12Sorted SA12.
[in]d_isa_12ISA12.
[in]d_flagFlags to mark SA1.
[in]mod_1The number of elements in SA1.
[in]tThreadsThe number of elements in SA12.
__global__ void isa12_construct ( uint *  d_keys_srt_12,
uint *  d_isa_12,
uint *  d_flag,
int  mod_1,
size_t  tThreads 
)

Construct ISA12.

Parameters
[in]d_keys_srt_12Fully sorted SA12 in global position.
[out]d_isa_12ISA12 to store the ranks in local position.
[out]d_flagFlags to mark SA1.
[in]mod_1The number of elements in SA1.
[in]tThreadsThe number of elements in SA12.
__global__ void sa3_srt_construct ( uint *  d_keys_srt_3,
uint *  d_str,
uint *  d_keys_srt_12,
uint *  d_keys_sa,
size_t  tThreads1,
size_t  tThreads2,
int  str_length 
)

Construct SA3 triplets positions.

Parameters
[out]d_keys_srt_3SA3 generated from SA1.
[in]d_strOriginal input array.
[in]d_keys_srt_12Fully sorted SA12.
[in]d_keys_saPositions of SA1.
[in]tThreads1The number of elements of SA12.
[in]tThreads2The number of elements of SA3.
[in]str_lengthThe number of elements in original string.
__global__ void sa3_keys_construct ( uint *  d_keys_srt_3,
uint *  d_keys_sa,
uint *  d_str,
size_t  tThreads,
int  str_length 
)

Construct SA3 triplets keys.

Parameters
[in]d_keys_srt_3SA3 triplets positions.
[out]d_keys_saSA3 keys.
[in]d_strOriginal input string.
[in]tThreadsThe number of elements in SA12.
[in]str_lengthThe number of elements in original string.
__global__ void merge_akeys_construct ( uint *  d_str,
uint *  d_keys_srt_12,
uint *  d_isa_12,
Vector *  d_aKeys,
size_t  tThreads,
int  mod_1,
int  bound,
int  str_length 
)

Construct SA12 keys in terms of Vector.

Parameters
[in]d_strOriginal input data stream
[in]d_keys_srt_12The order of aKeys.
[in]d_isa_12The ranks in SA12 orders.
[out]d_aKeysSA12 keys in Vectors.
[in]tThreadsThe number of elements in SA12.
[in]mod_1The number of elements in SA1.
[in]boundThe number of elements in SA12 plus SA3.
[in]str_lengthThe number of elements in original string.
__global__ void merge_bkeys_construct ( uint *  d_str,
uint *  d_keys_srt_3,
uint *  d_isa_12,
Vector *  d_bKeys,
size_t  tThreads,
int  mod_1,
int  bound,
int  str_length 
)

Construct SA3 keys in Vector.

Parameters
[in]d_strOriginal input data stream.
[in]d_keys_srt_3The order of bKeys
[in]d_isa_12ISA12.
[out]d_bKeysSA3 keys in Vectors.
[in]tThreadsThe number of total threads.
[in]mod_1The number of elements in SA1.
[in]boundThe number of elements in SA12 and SA3.
[in]str_lengthThe number of elements in original str.
template<class T , class traits >
__global__ void scan4 ( T *  d_out,
const T *  d_in,
T *  d_blockSums,
int  numElements,
unsigned int  dataRowPitch,
unsigned int  blockSumRowPitch 
)

Main scan kernel.

This global device function performs one level of a multiblock scan on an arbitrary-dimensioned array in d_in, returning the result in d_out (which may point to the same array). The same function may be used for single or multi-row scans. To perform a multirow scan, pass the width of each row of the input array (in elements) in dataRowPitch, and the width of the rows of d_blockSums (in elements) in blockSumRowPitch, and invoke with a thread block grid with height greater than 1.

This function performs one level of a recursive, multiblock scan. At the app level, this function is called by cudppScan and cudppMultiScan and used in combination with vectorAddUniform4() to produce a complete scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the ScanTraits struct containing compile-time options for the scan, such as whether it is forward or backward, exclusive or inclusive, multi- or single-row, etc.

Parameters
[out]d_outThe output (scanned) array
[in]d_inThe input array to be scanned
[out]d_blockSumsThe array of per-block sums
[in]numElementsThe number of elements to scan
[in]dataRowPitchThe width of each row of d_in in elements (for multi-row scans)
[in]blockSumRowPitchThe width of each row of d_blockSums in elements (for multi-row scans)
template<class T , class traits >
__global__ void segmentedScan4 ( T *  d_odata,
const T *  d_idata,
const unsigned int *  d_iflags,
unsigned int  numElements,
T *  d_blockSums = 0,
unsigned int *  d_blockFlags = 0,
unsigned int *  d_blockIndices = 0 
)

Main segmented scan kernel.

This global device function performs one level of a multiblock segmented scan on a one-dimensional array in d_idata, returning the result in d_odata (which may point to the same array).

This function performs one level of a recursive, multiblock scan. At the app level, this function is called by cudppSegmentedScan and used in combination with either vectorSegmentedAddUniform4() (forward) or vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan.

Template parameter T is the datatype of the array to be scanned. Template parameter traits is the SegmentedScanTraits struct containing compile-time options for the segmented scan, such as whether it is forward or backward, inclusive or exclusive, etc.

Parameters
[out]d_odataThe output (scanned) array
[in]d_idataThe input array to be scanned
[in]d_iflagsThe input array of flags
[out]d_blockSumsThe array of per-block sums
[out]d_blockFlagsThe array of per-block OR-reduction of flags
[out]d_blockIndicesThe array of per-block min-reduction of indices
[in]numElementsThe number of elements to scan
template<class T , bool isFullBlock>
__global__ void sparseMatrixVectorFetchAndMultiply ( unsigned int *  d_flags,
T *  d_prod,
const T *  d_A,
const T *  d_x,
const unsigned int *  d_indx,
unsigned int  numNZElts 
)

Fetch and multiply kernel.

This global device function takes an element from the vector d_A, finds its column in d_indx and multiplies the element from d_A with its corresponding (that is having the same row) element in d_x and stores the resulting product in d_prod. It also sets all the elements of d_flags to 0.

Template parameter T is the datatype of the matrix A and x.

Parameters
[out]d_flagsThe output flags array
[out]d_prodThe output products array
[in]d_AThe input matrix A
[in]d_xThe input array x
[in]d_indxThe input array of column indices for each element in A
[in]numNZEltsThe number of non-zero elements in matrix A
__global__ void sparseMatrixVectorSetFlags ( unsigned int *  d_flags,
const unsigned int *  d_rowindx,
unsigned int  numRows 
)

Set Flags kernel.

This global device function takes an element from the vector d_rowindx, and sets the corresponding position in d_flags to 1

Parameters
[out]d_flagsThe output flags array
[in]d_rowindxThe starting index of each row in the "flattened" version of matrix A
[in]numRowsThe number of rows in matrix A
template<class T >
__global__ void yGather ( T *  d_y,
const T *  d_prod,
const unsigned int *  d_rowFindx,
unsigned int  numRows 
)

Gather final y values kernel.

This global device function takes an element from the vector d_rowFindx, which for each row gives the index of the last element of that row, reads the corresponding position in d_prod and writes it to d_y.

Template parameter T is the datatype of the matrix A and x.

Parameters
[out]d_yThe output result array
[in]d_prodThe input products array (which now contains sums for each row)
[in]d_rowFindxThe starting index of each row in the "flattened" version of matrix A
[in]numRowsThe number of rows in matrix A
__global__ void alignedOffsets ( unsigned int *  numSpaces,
unsigned int *  d_address,
unsigned char *  d_stringVals,
unsigned char  termC,
unsigned int  numElements,
unsigned int  stringSize 
)

Calculate the number of spaces required for each string to align the string array.

Parameters
[out]numSpacesNumber of spaces required for each string
[in]d_addressInput addresses of each string
[in]d_stringValsString array
[in]termCTermination character for the strings
[in]numElementsNumber of strings
[in]stringSizeNumber of characters in the string array
__global__ void alignString ( unsigned int *  packedStrings,
unsigned char *  d_stringVals,
unsigned int *  packedAddress,
unsigned int *  address,
unsigned int  numElements,
unsigned int  stringArrayLength,
unsigned char  termC 
)

Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned.

Parameters
[out]packedStringsResulting packed strings.
[in]d_stringValsUnpacked string array which we will pack
[out]packedAddressResulting addresses for each string to the packedStrings array
[in]addressInput addresses of unpacked strings
[in]numElementsNumber of strings
[in]stringArrayLengthNumber of characters in the string array
[in]termCTermination character for the strings
__global__ void createKeys ( unsigned int *  d_keys,
unsigned int *  packedStrings,
unsigned int *  packedAddress,
unsigned int  numElements 
)

Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array.

Parameters
[out]d_keysResulting keys
[in]packedStringsPacked string array.
[in]packedAddressAddresses which point to the string array.
[in]numElementsNumber of strings
__global__ void unpackAddresses ( unsigned int *  packedAddress,
unsigned int *  packedAddressRef,
unsigned int *  address,
unsigned int *  addressRef,
size_t  numElements 
)

Converts addresses from packed (unaligned) form to unpacked and unaligned form. Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4).

Parameters
[in]packedAddressResulting packed addresses that have been sorted. All strings are aligned.
[in]packedAddressRefOriginal array after packing (before sort). Used as a reference.
[out]addressFinal output of sorted addresses in unpacked form.
[in]addressRefReference array of original unpacked addresses.
[in]numElementsNumber of strings
template<class T , int depth>
__global__ void blockWiseStringSort ( T *  A_keys,
T *  A_address,
T *  stringVals,
int  blockSize,
int  totalSize,
unsigned int  stringSize,
unsigned char  termC 
)

Does an initial blockSort based on the size of our partition (limited by shared memory size)

Parameters
[in,out]A_keys,A_addressThis sort is in-place. A_keys and A_address store the key (first four characters) and addresses of our strings
[in]stringValsGlobal array of strings for tie breaks
[in]blockSizeSize of each block
[in]totalSizeThe total size of the array we are sorting
[in]stringSizeThe size of our string array (stringVals)
[in]termCTermination character for the strings
template<class T , int depth>
__global__ void simpleStringMerge ( T *  A_keys,
T *  A_keys_out,
T *  A_values,
T *  A_values_out,
T *  stringValues,
int  sizePerPartition,
int  size,
int  step,
int  stringSize,
unsigned char  termC 
)

Merges two independent sets. Each CUDA block works on two partitions of data without cooperating.

Parameters
[in]A_keysFirst four characters (input) of our sets to merge
[in]A_valuesAddresses of the strings (for tie breaks)
[in]stringValuesGlobal string array for tie breaks
[out]A_keys_out,A_values_outKeys and values array after merge step
[in]sizePerPartitionThe size of each partition for this merge step
[in]sizeGlobal size of our array
[in]stepNumber of merges done so far
[in]stringSizeglobal string length
[in]termCTermination character for the strings
template<class T >
__global__ void findMultiPartitions ( T *  A_keys,
T *  A_address,
T *  stringValues,
int  splitsPP,
int  numPartitions,
int  partitionSize,
unsigned int *  partitionBeginA,
unsigned int *  partitionSizesA,
unsigned int *  partitionBeginB,
unsigned int *  partitionSizesB,
size_t  size,
size_t  stringSize,
unsigned char  termC 
)

For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions.

Parameters
[in]A_keys,A_addressFirst four characters (input), and addresses of our inputs
[in]stringValuesGlobal string array for tie breaks
[in]splitsPP,numPartitions,partitionSizePartition information for this routine (splitsPP=splits Per Partition)
[out]partitionBeginA,partitionSizesAPartition starting points and sizes for each new subpartition in our original set in A
[out]partitionBeginB,partitionSizesBPartition starting points and sizes for each new subpartition in our original set in B
[in]size,stringSizeNumber of elements in our set, and size of our global string array
[in]termCTermination character for the strings
template<class T , int depth>
__global__ void stringMergeMulti ( T *  A_keys,
T *  A_keys_out,
T *  A_values,
T *  A_values_out,
T *  stringValues,
int  subPartitions,
int  numBlocks,
unsigned int *  partitionBeginA,
unsigned int *  partitionSizeA,
unsigned int *  partitionBeginB,
unsigned int *  partitionSizeB,
int  entirePartitionSize,
int  step,
size_t  size,
size_t  stringSize,
unsigned char  termC 
)

Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s)

Parameters
[in]A_keys,A_valuesFirst four characters (input), and addresses of our inputs
[out]A_keys_out,A_values_outFirst four characters, and addresses for our outputs(ping-pong)
[in]stringValuesstring array for tie breaks
[in]subPartitions,numBlocksNumber of splits per partitions and number of partitions respectively
[in]partitionBeginA,partitionSizeAWhere partitions begin and how large they are for Segment A
[in]partitionBeginB,partitionSizeBWhere partitions begin and how large they are for Segment B
[in]entirePartitionSizeThe maximum length of a partition
[in]stepNumber of merge cycles done
[in]sizeNumber of total strings being sorted
[in]stringSizeLength of string array
[in]termCTermination character for the strings
template<class T >
__global__ void crpcrKernel ( T *  d_a,
T *  d_b,
T *  d_c,
T *  d_d,
T *  d_x,
unsigned int  systemSizeOriginal,
unsigned int  iterations 
)

Hybrid CR-PCR Tridiagonal linear system solver (CRPCR)

This kernel solves a tridiagonal linear system using a hybrid CR-PCR algorithm. The solver first reduces the system size using cyclic reduction, then solves the intermediate system using parallel cyclic reduction to reduce shared memory bank conflicts and algorithmic steps, and finally switches back to cyclic reduction to solve all unknowns.

Parameters
[out] d_x: Solution vector
[in] d_a: Lower diagonal
[in] d_b: Main diagonal
[in] d_c: Upper diagonal
[in] d_d: Right hand side
[in] systemSizeOriginal: The size of each system
[in] iterations: The computed number of PCR iterations
template<class T >
__global__ void vectorAddConstant ( T *  d_vector,
T  constant,
int  n,
int  baseIndex 
)

Adds a constant value to all values in the input d_vector.

Each thread adds two pairs of elements.

Todo:
Test this function – it is currently not yet used.
Parameters
[in,out] d_vector: The array of elements to be modified
[in] constant: The constant value to be added to elements of d_vector
[in] n: The number of elements in the d_vector to be modified
[in] baseIndex: An optional offset to the beginning of the elements in the input array to be processed
template<class T >
__global__ void vectorAddUniform ( T *  d_vector,
const T *  d_uniforms,
int  numElements,
int  blockOffset,
int  baseIndex 
)

Add a uniform value to each data element of an array.

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds two pairs of values.

Parameters
[in,out] d_vector: The d_vector whose values will have the uniform added
[in] d_uniforms: The array of uniform values (one per CTA)
[in] numElements: The number of elements in d_vector to process
[in] blockOffset: An optional offset to the beginning of this block's data.
[in] baseIndex: An optional offset to the beginning of the array within d_vector.
template<class T , class Oper , int elementsPerThread, bool fullBlocks>
__global__ void vectorAddUniform4 ( T *  d_vector,
const T *  d_uniforms,
int  numElements,
int  vectorRowPitch,
int  uniformRowPitch,
int  blockOffset,
int  baseIndex 
)

Add a uniform value to each data element of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds the uniform value to eight values in d_vector.

Parameters
[in,out] d_vector: The d_vector whose values will have the uniform added
[in] d_uniforms: The array of uniform values (one per CTA)
[in] numElements: The number of elements in d_vector to process
[in] vectorRowPitch: For 2D arrays, the pitch (in elements) of the rows of d_vector.
[in] uniformRowPitch: For 2D arrays, the pitch (in elements) of the rows of d_uniforms.
[in] blockOffset: An optional offset to the beginning of this block's data.
[in] baseIndex: An optional offset to the beginning of the array within d_vector.
template<class T >
__global__ void vectorAddVector ( T *  d_vectorA,
const T *  d_vectorB,
int  numElements,
int  baseIndex 
)

Adds together two vectors.

Each thread adds two pairs of elements.

Todo:
Test this function – it is currently not yet used.
Parameters
[in,out] d_vectorA: The left operand array and the result
[in] d_vectorB: The right operand array
[in] numElements: The number of elements in the vectors to be added.
[in] baseIndex: An optional offset to the beginning of the elements in the input arrays to be processed
template<class T , class Oper , bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4 ( T *  d_vector,
const T *  d_uniforms,
const unsigned int *  d_maxIndices,
unsigned int  numElements,
int  blockOffset,
int  baseIndex 
)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index less than that CTA's entry in d_maxIndices. If the entry for that CTA is UINT_MAX, the uniform is added to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters
[in,out] d_vector: The d_vector whose values will have the uniform added
[in] d_uniforms: The array of uniform values (one per CTA)
[in] d_maxIndices: The array of maximum indices (one per CTA). This is the index up to which the uniform is added. If this is UINT_MAX, the uniform is added to all elements of the CTA. This index is 1-based.
[in] numElements: The number of elements in d_vector to process
[in] blockOffset: An optional offset to the beginning of this block's data.
[in] baseIndex: An optional offset to the beginning of the array within d_vector.
template<class T , class Oper , bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4 ( T *  d_vector,
const T *  d_uniforms,
const unsigned int *  d_minIndices,
unsigned int  numElements,
int  blockOffset,
int  baseIndex 
)

Add a uniform value to data elements of an array (vec4 version)

This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index greater than that CTA's entry in d_minIndices. If the entry for that CTA is 0, the uniform is added to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.

Parameters
[in,out] d_vector: The d_vector whose values will have the uniform added
[in] d_uniforms: The array of uniform values (one per CTA)
[in] d_minIndices: The array of minimum indices (one per CTA). The uniform is added to the right of this index (that is, to every index that is greater than this index). If this is 0, the uniform is added to all elements of the CTA. This index is 1-based to avoid overloading the meaning of 0: here 0 means the absence of a flag, but with 0-based indices a flag on the first element of a CTA would also yield index 0. Hence we use 1-based indices, so the index is 1 in the latter case.
[in] numElements: The number of elements in d_vector to process
[in] blockOffset: An optional offset to the beginning of this block's data.
[in] baseIndex: An optional offset to the beginning of the array within d_vector.