CUDPP
2.1
CUDA Data-Parallel Primitives Library
|
Functions | |
__global__ void | alignedOffsets (unsigned int *numSpaces, unsigned int *d_address, unsigned char *d_stringVals, unsigned char termC, unsigned int numElements, unsigned int stringSize) |
Calculate the number of spaces required for each string to align the string array. More... | |
__global__ void | alignString (unsigned int *packedStrings, unsigned char *d_stringVals, unsigned int *packedAddress, unsigned int *address, unsigned int numElements, unsigned int stringArrayLength, unsigned char termC) |
Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned. More... | |
__global__ void | createKeys (unsigned int *d_keys, unsigned int *packedStrings, unsigned int *packedAddress, unsigned int numElements) |
Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array. More... | |
__global__ void | unpackAddresses (unsigned int *packedAddress, unsigned int *packedAddressRef, unsigned int *address, unsigned int *addressRef, size_t numElements) |
Converts addresses from packed (unaligned) form to unpacked and unaligned form Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4) More... | |
template<class T , int depth> | |
__global__ void | blockWiseStringSort (T *A_keys, T *A_address, T *stringVals, int blockSize, int totalSize, unsigned int stringSize, unsigned char termC) |
Does an initial blockSort based on the size of our partition (limited by shared memory size) More... | |
template<class T , int depth> | |
__global__ void | simpleStringMerge (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int sizePerPartition, int size, int step, int stringSize, unsigned char termC) |
Merges two independent sets. Each CUDA block works on two partitions of data without cooperating. More... | |
template<class T > | |
__global__ void | findMultiPartitions (T *A_keys, T *A_address, T *stringValues, int splitsPP, int numPartitions, int partitionSize, unsigned int *partitionBeginA, unsigned int *partitionSizesA, unsigned int *partitionBeginB, unsigned int *partitionSizesB, size_t size, size_t stringSize, unsigned char termC) |
For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions. More... | |
template<class T , int depth> | |
__global__ void | stringMergeMulti (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int subPartitions, int numBlocks, unsigned int *partitionBeginA, unsigned int *partitionSizeA, unsigned int *partitionBeginB, unsigned int *partitionSizeB, int entirePartitionSize, int step, size_t size, size_t stringSize, unsigned char termC) |
Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s) More... | |
Compact Functions | |
template<class T , bool isBackward> | |
__global__ void | compactData (T *d_out, size_t *d_numValidElements, const unsigned int *d_indices, const unsigned int *d_isValid, const T *d_in, unsigned int numElements) |
Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray(). More... | |
Compress Functions | |
typedef unsigned int | uint |
typedef unsigned char | uchar |
typedef unsigned short | ushort |
__global__ void | bwt_compute_final_kernel (const uchar *d_bwtIn, const uint *d_values, int *d_bwtIndex, uchar *d_bwtOut, uint numElements, uint tThreads) |
Compute final BWT. More... | |
template<class T , int depth> | |
__global__ void | stringMergeMulti (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int *partitionBeginB, int *partitionSizeB, int entirePartitionSize, size_t numElements) |
Multi merge. More... | |
template<class T > | |
__global__ void | findMultiPartitions (T *A, int splitsPP, int numPartitions, int partitionSize, int *partitionBeginA, int *partitionSizesA, int *partitionBeginB, int *partitionSizesB, int sizeA) |
Merges the indices for the "upper" block (right block) More... | |
template<class T , int depth> | |
__global__ void | simpleStringMerge (T *A_keys, T *A_keys_out, T *A_values, T *A_values_out, T *stringValues, int sizePerPartition, int size, T *stringValues2, size_t numElements) |
Simple merge. More... | |
template<class T , int depth> | |
__global__ void | blockWiseStringSort (T *A_keys, T *A_address, const T *stringVals, T *stringVals2, int blockSize, size_t numElements) |
Sorts blocks of data of size blockSize. More... | |
__global__ void | bwt_keys_construct_kernel (uchar4 *d_bwtIn, uint *d_bwtInRef, uint *d_keys, uint *d_values, uint *d_bwtInRef2, uint tThreads) |
Massage input to set up for merge sort. More... | |
__global__ void | mtf_reduction_kernel (const uchar *d_mtfIn, uchar *d_lists, ushort *d_list_sizes, uint nLists, uint offset, uint numElements) |
First stage in MTF (Reduction) More... | |
__global__ void | mtf_GLreduction_kernel (uchar *d_lists, ushort *d_list_sizes, uint offset, uint tThreads, uint nLists) |
Second stage in MTF (Global reduction) More... | |
__global__ void | mtf_GLdownsweep_kernel (uchar *d_lists, ushort *d_list_sizes, uint offset, uint lastLevel, uint nLists, uint tThreads) |
Third stage in MTF (Global downsweep) More... | |
__global__ void | mtf_localscan_lists_kernel (const uchar *d_mtfIn, uchar *d_mtfOut, uchar *d_lists, ushort *d_list_sizes, uint nLists, uint offset, uint numElements) |
Compute final MTF lists and final MTF output. More... | |
__global__ void | huffman_build_histogram_kernel (uint *d_input, uint *d_histograms, uint numElements) |
Compute 256-entry histogram. More... | |
__global__ void | huffman_build_tree_kernel (const uchar *d_input, uchar *d_huffCodesPacked, uint *d_huffCodeLocations, uchar *d_huffCodeLengths, uint *d_histograms, uint *d_histogram, uint *d_nCodesPacked, uint *d_totalEncodedSize, uint histBlocks, uint numElements) |
Build Huffman tree/codes. More... | |
__global__ void | huffman_kernel_en (uchar4 *d_input, uchar *d_codes, uint *d_code_locations, uchar *d_huffCodeLengths, encoded *d_encoded, uint nCodesPacked, uint nThreads) |
Perform parallel Huffman encoding. More... | |
__global__ void | huffman_datapack_kernel (encoded *d_encoded, uint *d_encodedData, uint *d_totalEncodedSize, uint *d_eOffsets) |
Pack together encoded blocks. More... | |
ListRank Functions | |
typedef unsigned int | uint |
typedef unsigned char | uchar |
typedef unsigned short | ushort |
template<typename T > | |
__global__ void | list_rank_kernel_soa_1 (T *d_ranked_values, const T *d_unranked_values, const int *d_ping, int *d_pong, int *d_start_indices, int step, int head, int numElts) |
Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank(). More... | |
template<typename T > | |
__global__ void | list_rank_kernel_soa_2 (T *d_ranked_values, const T *d_unranked_values, const int *d_pong, const int *d_start_indices, int head, int numElts) |
After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank(). More... | |
MergeSort Functions | |
typedef unsigned int | uint |
template<class T > | |
__global__ void | simpleCopy (T *A_keys_dev, unsigned int *A_vals_dev, T *A_keys_out_dev, unsigned int *A_vals_out_dev, int offset, int numElementsToCopy) |
Copies unused portions of arrays in our ping-pong strategy. More... | |
template<class T , int depth> | |
__global__ void | blockWiseSort (T *A_keys, unsigned int *A_values, int blockSize, size_t totalSize) |
Sorts blocks of data of size blockSize. More... | |
template<class T , int depth> | |
__global__ void | simpleMerge_lower (T *A_keys, unsigned int *A_values, T *A_keys_out, unsigned int *A_values_out, int sizePerPartition, int size) |
Merges the indices for the "lower" block (left block) More... | |
template<class T , int depth> | |
__global__ void | simpleMerge_higher (T *A_keys, unsigned int *A_values, T *A_keys_out, unsigned int *A_values_out, int sizePerPartition, int size) |
Merges the indices for the "upper" block (right block) More... | |
template<class T > | |
__global__ void | findMultiPartitions (T *A, int splitsPP, int numPartitions, int partitionSize, int *partitionBeginA, int *partitionSizesA, int sizeA) |
Merges the indices for the "upper" block (right block) More... | |
template<class T , int depth> | |
__global__ void | mergeMulti_lower (T *A_keys_out, unsigned int *A_vals_out, T *A_keys, unsigned int *A_vals, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int entirePartitionSize, int sizeA) |
Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block) More... | |
template<class T , int depth> | |
__global__ void | mergeMulti_higher (T *A_keys_out, unsigned int *A_vals_out, T *A_keys, unsigned int *A_vals, int subPartitions, int numBlocks, int *partitionBeginA, int *partitionSizeA, int entirePartitionSize, int sizeA) |
Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block) More... | |
RadixSort Functions | |
typedef unsigned int | uint |
__global__ void | emptyKernel () |
And empty kernel used to reset CTA issue hardware. | |
__global__ void | flipFloats (uint *values, uint numValues) |
Does special binary arithmetic before sorting floats. More... | |
__global__ void | unflipFloats (uint *values, uint numValues) |
Undoes the flips from flipFloats. More... | |
template<bool flip> | |
__global__ void | radixSortSingleWarp (uint *keys, uint *values, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. More... | |
template<bool flip> | |
__global__ void | radixSortSingleWarpKeysOnly (uint *keys, uint numElements) |
Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. More... | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocks (uint4 *keysOut, uint4 *valuesOut, uint4 *keysIn, uint4 *valuesIn, uint numElements, uint totalBlocks) |
sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements More... | |
template<uint startbit, bool fullBlocks, bool loop> | |
__global__ void | findRadixOffsets (uint2 *keys, uint *counters, uint *blockOffsets, uint numElements, uint totalBlocks) |
Computes the number of keys of each radix in each block stores offset. More... | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderData (uint *outKeys, uint *outValues, uint2 *keys, uint2 *values, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. More... | |
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop> | |
__global__ void | radixSortBlocksKeysOnly (uint4 *keysOut, uint4 *keysIn, uint numElements, uint totalBlocks) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements. More... | |
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop> | |
__global__ void | reorderDataKeysOnly (uint *outKeys, uint2 *keys, uint *blockOffsets, uint *offsets, uint *sizes, uint numElements, uint totalBlocks) |
Reorders data in the global array. More... | |
Rand Functions | |
__global__ void | gen_randMD5 (uint4 *d_out, size_t numElements, unsigned int seed) |
The main MD5 generation algorithm. More... | |
Reduce Functions | |
template<typename T , class Oper , unsigned int blockSize, bool nIsPow2> | |
__global__ void | reduce (T *odata, const T *idata, unsigned int n) |
Main reduction kernel. More... | |
Scan Functions | |
template<class T , class traits > | |
__global__ void | scan4 (T *d_out, const T *d_in, T *d_blockSums, int numElements, unsigned int dataRowPitch, unsigned int blockSumRowPitch) |
Main scan kernel. More... | |
Segmented scan Functions | |
template<class T , class traits > | |
__global__ void | segmentedScan4 (T *d_odata, const T *d_idata, const unsigned int *d_iflags, unsigned int numElements, T *d_blockSums=0, unsigned int *d_blockFlags=0, unsigned int *d_blockIndices=0) |
Main segmented scan kernel. More... | |
Sparse Matrix-Vector multiply Functions | |
template<class T , bool isFullBlock> | |
__global__ void | sparseMatrixVectorFetchAndMultiply (unsigned int *d_flags, T *d_prod, const T *d_A, const T *d_x, const unsigned int *d_indx, unsigned int numNZElts) |
Fetch and multiply kernel. More... | |
__global__ void | sparseMatrixVectorSetFlags (unsigned int *d_flags, const unsigned int *d_rowindx, unsigned int numRows) |
Set Flags kernel. More... | |
template<class T > | |
__global__ void | yGather (T *d_y, const T *d_prod, const unsigned int *d_rowFindx, unsigned int numRows) |
Gather final y values kernel. More... | |
Tridiagonal functions | |
template<class T > | |
__global__ void | crpcrKernel (T *d_a, T *d_b, T *d_c, T *d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int iterations) |
Hybrid CR-PCR Tridiagonal linear system solver (CRPCR) More... | |
Vector Functions | |
template<class T > | |
__global__ void | vectorAddConstant (T *d_vector, T constant, int n, int baseIndex) |
Adds a constant value to all values in the input d_vector. More... | |
template<class T > | |
__global__ void | vectorAddUniform (T *d_vector, const T *d_uniforms, int numElements, int blockOffset, int baseIndex) |
Add a uniform value to each data element of an array. More... | |
template<typename T > | |
__global__ void | vectorAddUniform2 (T *g_data, T *uniforms, int n, int eltsPerBlock) |
template<class T , class Oper , int elementsPerThread, bool fullBlocks> | |
__global__ void | vectorAddUniform4 (T *d_vector, const T *d_uniforms, int numElements, int vectorRowPitch, int uniformRowPitch, int blockOffset, int baseIndex) |
Add a uniform value to each data element of an array (vec4 version) More... | |
template<class T > | |
__global__ void | vectorAddVector (T *d_vectorA, const T *d_vectorB, int numElements, int baseIndex) |
Adds together two vectors. More... | |
template<class T , class Oper , bool isLastBlockFull> | |
__global__ void | vectorSegmentedAddUniform4 (T *d_vector, const T *d_uniforms, const unsigned int *d_maxIndices, unsigned int numElements, int blockOffset, int baseIndex) |
Add a uniform value to data elements of an array (vec4 version) More... | |
template<class T , class Oper , bool isLastBlockFull> | |
__global__ void | vectorSegmentedAddUniformToRight4 (T *d_vector, const T *d_uniforms, const unsigned int *d_minIndices, unsigned int numElements, int blockOffset, int baseIndex) |
Add a uniform value to data elements of an array (vec4 version) More... | |
The CUDPP Kernel-Level API contains functions that run on the GPU device across a grid of Cooperative Thread Array (CTA, aka Thread Block). These kernels are declared global
so that they must be invoked from host (CPU) code. They generally invoke GPU device
routines in the CUDPP CTA-Level API. Kernel-Level API functions are used by CUDPP Application-Level functions to implement their functionality.
__global__ void compactData | ( | T * | d_out, |
size_t * | d_numValidElements, | ||
const unsigned int * | d_indices, | ||
const unsigned int * | d_isValid, | ||
const T * | d_in, | ||
unsigned int | numElements | ||
) |
Consolidate non-null elements - for each non-null element in d_in write it to d_out, in the position specified by d_isValid. Called by compactArray().
[out] | d_out | Output array of compacted values. |
[out] | d_numValidElements | The number of elements in d_in with valid flags set to 1. |
[in] | d_indices | Positions where non-null elements will go in d_out. |
[in] | d_isValid | Flags indicating valid (1) and invalid (0) elements. Only valid elements will be copied to d_out. |
[in] | d_in | The input array |
[in] | numElements | The length of the d_in in elements. |
__global__ void bwt_compute_final_kernel | ( | const uchar * | d_bwtIn, |
const uint * | d_values, | ||
int * | d_bwtIndex, | ||
uchar * | d_bwtOut, | ||
uint | numElements, | ||
uint | tThreads | ||
) |
Compute final BWT.
This is the final stage in the BWT. This stage computes the final values of the BWT output. It is given the indices of where each of the cyclical rotations of the initial input were sorted to. It uses these indices to figure out the last "column" of the sorted cyclical rotations which is the final BWT output.
[in] | d_bwtIn | Input char array to perform the BWT on. |
[in] | d_values | Input array that gives the indices of where each of the cyclical rotations of the intial input were sorted to. |
[out] | d_bwtIndex | Output pointer to store the BWT index. The index tells us where the original string sorted to. |
[out] | d_bwtOut | Output char array of the BWT. |
[in] | numElements | The number of elements we are performing a BWT on. |
[in] | tThreads | The total threads we have dispatched on the device. |
__global__ void stringMergeMulti | ( | T * | A_keys, |
T * | A_keys_out, | ||
T * | A_values, | ||
T * | A_values_out, | ||
T * | stringValues, | ||
int | subPartitions, | ||
int | numBlocks, | ||
int * | partitionBeginA, | ||
int * | partitionSizeA, | ||
int * | partitionBeginB, | ||
int * | partitionSizeB, | ||
int | entirePartitionSize, | ||
size_t | numElements | ||
) |
Multi merge.
[in] | A_keys | keys to be sorted |
[out] | A_keys_out | keys after being sorted |
[in] | A_values | associated values to keys |
[out] | A_values_out | associated values after sort |
[in] | stringValues | keys of each of the cyclical rotations |
[in] | subPartitions | Number of blocks working on a partition (number of sub-partitions) |
[in] | numBlocks | |
[out] | partitionBeginA | Where each partition/subpartition will begin in A |
[in] | partitionSizeA | Partition sizes decided by function findMultiPartitions |
[out] | partitionBeginB | Where each partition/subpartition will begin in B |
[in] | partitionSizeB | Partition sizes decided by function findMultiPartitions |
[in] | entirePartitionSize | The size of an entire partition (before it is split up) |
[in] | numElements | Size of the enitre array |
__global__ void findMultiPartitions | ( | T * | A, |
int | splitsPP, | ||
int | numPartitions, | ||
int | partitionSize, | ||
int * | partitionBeginA, | ||
int * | partitionSizesA, | ||
int * | partitionBeginB, | ||
int * | partitionSizesB, | ||
int | sizeA | ||
) |
Merges the indices for the "upper" block (right block)
Utilizes a "ping-pong" strategy
[in] | A | Global array of keys |
[in] | splitsPP | Global array of values to be merged |
[in] | numPartitions | number of partitions being considered |
[in] | partitionSize | Size of each partition being considered |
[out] | partitionBeginA | Where each partition/subpartition will begin in A |
[out] | partitionSizesA | Size of each partition/subpartition in A |
[out] | partitionBeginB | Where each partition/subpartition will begin in B |
[out] | partitionSizesB | Size of each partition/subpartition in B |
[in] | sizeA | Size of the entire array |
__global__ void simpleStringMerge | ( | T * | A_keys, |
T * | A_keys_out, | ||
T * | A_values, | ||
T * | A_values_out, | ||
T * | stringValues, | ||
int | sizePerPartition, | ||
int | size, | ||
T * | stringValues2, | ||
size_t | numElements | ||
) |
Simple merge.
[in] | A_keys | keys to be sorted |
[out] | A_keys_out | keys after being sorted |
[in] | A_values | associated values to keys |
[out] | A_values_out | associated values after sort |
[in] | stringValues | BWT string manipulated to words |
[in] | sizePerPartition | Size of each partition being merged |
[in] | size | Size of total Array being sorted |
[in] | stringValues2 | keys of each of the cyclical rotations |
[in] | numElements | Number of elements being sorted |
__global__ void blockWiseStringSort | ( | T * | A_keys, |
T * | A_address, | ||
const T * | stringVals, | ||
T * | stringVals2, | ||
int | blockSize, | ||
size_t | numElements | ||
) |
Sorts blocks of data of size blockSize.
[in,out] | A_keys | keys to be sorted |
[in,out] | A_address | associated values to keys |
[in] | stringVals | BWT string manipulated to words |
[in] | stringVals2 | keys of each of the cyclical rotations |
[in] | blockSize | Size of the chunks being sorted |
[in] | numElements | Size of the enitre array |
__global__ void bwt_keys_construct_kernel | ( | uchar4 * | d_bwtIn, |
uint * | d_bwtInRef, | ||
uint * | d_keys, | ||
uint * | d_values, | ||
uint * | d_bwtInRef2, | ||
uint | tThreads | ||
) |
Massage input to set up for merge sort.
[in] | d_bwtIn | A char array of the input data stream to perform the BWT on. |
[out] | d_bwtInRef | BWT string manipulated to words. |
[out] | d_keys | An array of associated keys to sort by the first four chars of the cyclical rotations. |
[out] | d_values | Array of values associates with the keys to sort. |
[out] | d_bwtInRef2 | keys of each of the cyclical rotations. |
[in] | tThreads | Pointer to the plan object used for this BWT. |
__global__ void mtf_reduction_kernel | ( | const uchar * | d_mtfIn, |
uchar * | d_lists, | ||
ushort * | d_list_sizes, | ||
uint | nLists, | ||
uint | offset, | ||
uint | numElements | ||
) |
First stage in MTF (Reduction)
[in] | d_mtfIn | A char array of the input data stream to perform the MTF on. |
[out] | d_lists | A pointer to the start of MTF lists. |
[out] | d_list_sizes | An array storing the size of each MTF list. |
[in] | nLists | Total number of MTF lists. |
[in] | offset | The offset during the reduction stage. Initialized to two. |
[in] | numElements | Total number of input elements MTF transform. |
__global__ void mtf_GLreduction_kernel | ( | uchar * | d_lists, |
ushort * | d_list_sizes, | ||
uint | offset, | ||
uint | tThreads, | ||
uint | nLists | ||
) |
Second stage in MTF (Global reduction)
[in,out] | d_lists | A pointer to the start of MTF lists. |
[in,out] | d_list_sizes | An array storing the size of each MTF list. |
[in] | offset | The offset during the reduction stage. Initialized to two. |
[in] | tThreads | Total number of threads dispatched. |
[in] | nLists | Total number of MTF lists. |
__global__ void mtf_GLdownsweep_kernel | ( | uchar * | d_lists, |
ushort * | d_list_sizes, | ||
uint | offset, | ||
uint | lastLevel, | ||
uint | nLists, | ||
uint | tThreads | ||
) |
Third stage in MTF (Global downsweep)
[in,out] | d_lists | A pointer to the start of MTF lists. |
[in,out] | d_list_sizes | An array storing the size of each MTF list. |
[in] | offset | The offset during the reduction stage. |
[in] | lastLevel | The limit to which offset can be set to. |
[in] | nLists | Total number of MTF lists. |
[in] | tThreads | Total number of threads dispatched. |
__global__ void mtf_localscan_lists_kernel | ( | const uchar * | d_mtfIn, |
uchar * | d_mtfOut, | ||
uchar * | d_lists, | ||
ushort * | d_list_sizes, | ||
uint | nLists, | ||
uint | offset, | ||
uint | numElements | ||
) |
Compute final MTF lists and final MTF output.
[in] | d_mtfIn | A char array of the input data stream to perform the MTF on. |
[in] | d_mtfOut | A char array of the output with the transformed MTF string. |
[in,out] | d_lists | A pointer to the start of MTF lists. |
[in] | d_list_sizes | An array storing the size of each MTF list. |
[in] | nLists | Total number of MTF lists. |
[in] | offset | The offset during the reduction stage. |
[in] | numElements | Total number of elements to perform the MTF on. |
__global__ void huffman_build_histogram_kernel | ( | uint * | d_input, |
uint * | d_histograms, | ||
uint | numElements | ||
) |
Compute 256-entry histogram.
[in] | d_input | An array of words we will use to build our histogram. |
[out] | d_histograms | A pointer where we store our global histograms. |
[in] | numElements | The total number of elements to build our histogram from. |
__global__ void huffman_build_tree_kernel | ( | const uchar * | d_input, |
uchar * | d_huffCodesPacked, | ||
uint * | d_huffCodeLocations, | ||
uchar * | d_huffCodeLengths, | ||
uint * | d_histograms, | ||
uint * | d_histogram, | ||
uint * | d_nCodesPacked, | ||
uint * | d_totalEncodedSize, | ||
uint | histBlocks, | ||
uint | numElements | ||
) |
Build Huffman tree/codes.
[in] | d_input | An array of input elements to encode |
[out] | d_huffCodesPacked | An array of huffman bit codes packed together |
[out] | d_huffCodeLocations | An array which stores the starting bit locations of each Huffman bit code |
[out] | d_huffCodeLengths | An array which stores the lengths of each Huffman bit code |
[in] | d_histograms | An input array of histograms to combine |
[out] | d_histogram | Final histogram combined |
[out] | d_nCodesPacked | Number of chars it took to store all Huffman bit codes |
[out] | d_totalEncodedSize | Total number of words it takes to hold the compressed data |
[in] | histBlocks | Total number of histograms we will combine into one |
[in] | numElements | Number of elements to compress |
__global__ void huffman_kernel_en | ( | uchar4 * | d_input, |
uchar * | d_codes, | ||
uint * | d_code_locations, | ||
uchar * | d_huffCodeLengths, | ||
encoded * | d_encoded, | ||
uint | nCodesPacked, | ||
uint | nThreads | ||
) |
Perform parallel Huffman encoding.
[in] | d_input | Input array to encode |
[in] | d_codes | Array of packed Huffman bit codes |
[in] | d_code_locations | Array of starting Huffman bit locations |
[in] | d_huffCodeLengths | An array storing the bit lengths of the Huffman codes |
[out] | d_encoded | An array of encoded classes which stores the size and data of encoded data |
[in] | nCodesPacked | Number of chars it took to store all Huffman bit codes |
[in] | nThreads | Total number of dispatched threads |
__global__ void huffman_datapack_kernel | ( | encoded * | d_encoded, |
uint * | d_encodedData, | ||
uint * | d_totalEncodedSize, | ||
uint * | d_eOffsets | ||
) |
Pack together encoded blocks.
[in] | d_encoded | An array of encoded objects with stored size and data of the encoded data. |
[out] | d_encodedData | An in array to store all encoded data. |
[out] | d_totalEncodedSize | Total number words of the encoded data. |
[out] | d_eOffsets | Array holding the word offsets of each encoded data block. |
__global__ void list_rank_kernel_soa_1 | ( | T * | d_ranked_values, |
const T * | d_unranked_values, | ||
const int * | d_ping, | ||
int * | d_pong, | ||
int * | d_start_indices, | ||
int | step, | ||
int | head, | ||
int | numElts | ||
) |
Use pointer jumping to rank values. After ranking the values, calculate the next set of indices. The number of values ranked doubles at each kernel call. Called by listRank().
[out] | d_ranked_values | Ranked values array |
[in] | d_unranked_values | Unranked values array |
[in] | d_ping | Next indices array for the current kernel call |
[in] | d_pong | Next indices array for the next kernel call |
[in] | d_start_indices | Holds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage. |
[in] | step | The number of "ranking" threads. |
[in] | head | Head node index of the linked-list. |
[in] | numElts | Number of nodes to rank |
__global__ void list_rank_kernel_soa_2 | ( | T * | d_ranked_values, |
const T * | d_unranked_values, | ||
const int * | d_pong, | ||
const int * | d_start_indices, | ||
int | head, | ||
int | numElts | ||
) |
After pointer jumping is finished and all threads are able to rank values, ranking continues serially. Each thread ranks values until all values are ranked. Called by listRank().
[out] | d_ranked_values | Ranked values array |
[in] | d_unranked_values | Unranked values array |
[in] | d_pong | Next indices array for the current kernel call |
[in] | d_start_indices | Holds the starting node indices for "ranking" threads. The number of "ranking" threads doubles at each stage. |
[in] | head | Head node index of the linked-list. |
[in] | numElts | Number of nodes to rank |
__global__ void simpleCopy | ( | T * | A_keys_dev, |
unsigned int * | A_vals_dev, | ||
T * | A_keys_out_dev, | ||
unsigned int * | A_vals_out_dev, | ||
int | offset, | ||
int | numElementsToCopy | ||
) |
Copies unused portions of arrays in our ping-pong strategy.
[in] | A_keys_dev,A_vals_dev | The keys and values we will be copying |
[out] | A_keys_out_dev,A_vals_out_dev | The keys and values array we will copy to |
[in] | offset | The offset we are starting to copy from |
[in] | numElementsToCopy | The number of elements we copy starting from the offset |
[in] | A_keys_dev | The keys we will be copying |
[in] | A_vals_dev | The values we will be copying |
[out] | A_keys_out_dev | The destination keys array |
[out] | A_vals_out_dev | The destination values array |
[in] | offset | The offset we are starting to copy from |
[in] | numElementsToCopy | The number of elements we copy starting from the offset |
__global__ void blockWiseSort | ( | T * | A_keys, |
unsigned int * | A_values, | ||
int | blockSize, | ||
size_t | totalSize | ||
) |
Sorts blocks of data of size blockSize.
[in,out] | A_keys | keys to be sorted |
[in,out] | A_values | associated values to keys |
[in] | blockSize | Size of the chunks being sorted |
[in] | totalSize | Size of the enitre array |
__global__ void simpleMerge_lower | ( | T * | A_keys, |
unsigned int * | A_values, | ||
T * | A_keys_out, | ||
unsigned int * | A_values_out, | ||
int | sizePerPartition, | ||
int | size | ||
) |
Merges the indices for the "lower" block (left block)
Utilizes a "ping-pong" strategy
[in] | A_keys | Global array of keys to be merged |
[in] | A_values | Global array of values to be merged |
[out] | A_keys_out | Resulting array of keys merged |
[out] | A_values_out | Resulting array of values merged |
[in] | sizePerPartition | Size of each partition being merged |
[in] | size | Size of total Array being sorted |
__global__ void simpleMerge_higher | ( | T * | A_keys, |
unsigned int * | A_values, | ||
T * | A_keys_out, | ||
unsigned int * | A_values_out, | ||
int | sizePerPartition, | ||
int | size | ||
) |
Merges the indices for the "upper" block (right block)
Utilizes a "ping-pong" strategy
[in] | A_keys | Global array of keys to be merged |
[in] | A_values | Global array of values to be merged |
[out] | A_keys_out | Resulting array of keys merged |
[out] | A_values_out | Resulting array of values merged |
[in] | sizePerPartition | Size of each partition being merged |
[in] | size | Size of total Array being sorted |
__global__ void findMultiPartitions | ( | T * | A, |
int | splitsPP, | ||
int | numPartitions, | ||
int | partitionSize, | ||
int * | partitionBeginA, | ||
int * | partitionSizesA, | ||
int | sizeA | ||
) |
Merges the indices for the "upper" block (right block)
Utilizes a "ping-pong" strategy
[in] | A | Global array of keys |
[in] | splitsPP | Global array of values to be merged |
[in] | numPartitions | number of partitions being considered |
[in] | partitionSize | Size of each partition being considered |
[out] | partitionBeginA | Where each partition/subpartition will begin in A |
[out] | partitionSizesA | Size of each partition/subpartition in A |
[in] | sizeA | Size of the entire array |
__global__ void mergeMulti_lower | ( | T * | A_keys_out, |
unsigned int * | A_vals_out, | ||
T * | A_keys, | ||
unsigned int * | A_vals, | ||
int | subPartitions, | ||
int | numBlocks, | ||
int * | partitionBeginA, | ||
int * | partitionSizeA, | ||
int | entirePartitionSize, | ||
int | sizeA | ||
) |
Blocks cooperatively Merge two partitions for the indices in the "lower" block (left block)
Utilizes a "ping-pong" strategy
[out] | A_keys_out | Resulting array of keys merged |
[out] | A_vals_out | Resulting array of values merged |
[in] | A_keys | Global array of keys to be merged |
[in] | A_vals | Global array of values to be merged |
[in] | subPartitions | Number of blocks working on a partition (number of sub-partitions) |
[in] | numBlocks | |
[in] | partitionBeginA | Partition starting points decided by function findMultiPartitions |
[in] | partitionSizeA | Partition sizes decided by function findMultiPartitions |
[in] | entirePartitionSize | The size of an entire partition (before it is split up) |
[in] | sizeA | The total size of our array |
__global__ void mergeMulti_higher | ( | T * | A_keys_out, |
unsigned int * | A_vals_out, | ||
T * | A_keys, | ||
unsigned int * | A_vals, | ||
int | subPartitions, | ||
int | numBlocks, | ||
int * | partitionBeginA, | ||
int * | partitionSizeA, | ||
int | entirePartitionSize, | ||
int | sizeA | ||
) |
Blocks cooperatively Merge two partitions for the indices in the "upper" block (right block)
Utilizes a "ping-pong" strategy
[out] | A_keys_out | Resulting array of keys merged |
[out] | A_vals_out | Resulting array of values merged |
[in] | A_keys | Global array of keys to be merged |
[in] | A_vals | Global array of values to be merged |
[in] | subPartitions | Number of blocks working on a partition (number of sub-partitions) |
[in] | numBlocks | |
[in] | partitionBeginA | Partition starting points decided by function findMultiPartitions |
[in] | partitionSizeA | Partition sizes decided by function findMultiPartitions |
[in] | entirePartitionSize | The size of an entire partition (before it is split up) |
[in] | sizeA | The total size of our array |
__global__ void flipFloats | ( | uint * | values, |
uint | numValues | ||
) |
Does special binary arithmetic before sorting floats.
Uses floatFlip function to flip bits.
[in,out] | values | Values to be manipulated |
[in] | numValues | Number of values to be flipped |
__global__ void unflipFloats | ( | uint * | values, |
uint | numValues | ||
) |
Undoes the flips from flipFloats.
Uses floatUnflip function to unflip bits.
[in,out] | values | Values to be manipulated |
[in] | numValues | Number of values to be unflipped |
__global__ void radixSortSingleWarp | ( | uint * | keys, |
uint * | values, | ||
uint | numElements | ||
) |
Optimization for sorts of WARP_SIZE or fewer elements.
[in,out] | keys | Keys to be sorted. |
[in,out] | values | Associated values to be sorted (through keys). |
[in] | numElements | Number of elements in the sort. |
__global__ void radixSortSingleWarpKeysOnly | ( | uint * | keys, |
uint | numElements | ||
) |
Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.
[in,out] | keys | Keys to be sorted |
[in] | numElements | Total number of elements to be sorted |
__global__ void radixSortBlocks | ( | uint4 * | keysOut, |
uint4 * | valuesOut, | ||
uint4 * | keysIn, | ||
uint4 * | valuesIn, | ||
uint | numElements, | ||
uint | totalBlocks | ||
) |
sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.
By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.
[out] | keysOut | Output of sorted keys |
[out] | valuesOut | Output of associated values |
[in] | keysIn | Input of unsorted keys in GPU |
[in] | valuesIn | Input of associated input values |
[in] | numElements | Total number of elements to sort |
[in] | totalBlocks | The number of blocks of data to sort |
__global__ void findRadixOffsets | ( | uint2 * | keys, |
uint * | counters, | ||
uint * | blockOffsets, | ||
uint | numElements, | ||
uint | totalBlocks | ||
) |
Computes the number of keys of each radix in each block stores offset.
Given an array with blocks sorted according to a 4-bit radix group, each block counts the number of keys that fall into each radix in the group, and finds the starting offset of each radix in the block. It then writes the radix counts to the counters array, and the starting offsets to the blockOffsets array.
Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.
By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.
[in] | keys | Input keys |
[out] | counters | Radix count for each block |
[out] | blockOffsets | The offset address for each block |
[in] | numElements | Total number of elements |
[in] | totalBlocks | Total number of blocks |
__global__ void reorderData | ( | uint * | outKeys, |
uint * | outValues, | ||
uint2 * | keys, | ||
uint2 * | values, | ||
uint * | blockOffsets, | ||
uint * | offsets, | ||
uint * | sizes, | ||
uint | numElements, | ||
uint | totalBlocks | ||
) |
Reorders data in the global array.
reorderData shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.
Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.
By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.
[out] | outKeys | Output of sorted keys |
[out] | outValues | Output of associated values |
[in] | keys | Input of unsorted keys in GPU |
[in] | values | Input of associated input values |
[in] | blockOffsets | The offset address for each block |
[in] | offsets | Address of each radix within each block |
[in] | sizes | Number of elements in a block |
[in] | numElements | Total number of elements |
[in] | totalBlocks | Total number of data blocks to process |
__global__ void radixSortBlocksKeysOnly | ( | uint4 * | keysOut, |
uint4 * | keysIn, | ||
uint | numElements, | ||
uint | totalBlocks | ||
) |
Sorts all blocks of data independently in shared memory. Each thread block (CTA) sorts one block of 4*CTA_SIZE elements.
The radix sort is done in two stages. This stage calls radixSortBlock on each block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "flip" is used to only compile in the float flip code when float keys are used. "loop" is used when persistent CTAs are used.
By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.
[out] | keysOut | Output of sorted keys GPU main memory |
[in] | keysIn | Input of unsorted keys in GPU main memory |
[in] | numElements | Total number of elements to sort |
[in] | totalBlocks | Total number of blocks to sort |
__global__ void reorderDataKeysOnly | ( | uint * | outKeys, |
uint2 * | keys, | ||
uint * | blockOffsets, | ||
uint * | offsets, | ||
uint * | sizes, | ||
uint | numElements, | ||
uint | totalBlocks | ||
) |
Reorders data in the global array.
reorderDataKeysOnly shuffles data in the array globally after the radix offsets have been found. On compute version 1.1 and earlier GPUs, this code depends on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures that all writes are coalesced using extra work in the kernel. On later GPUs coalescing rules have been relaxed, so this extra overhead hurts performance. On these GPUs we set manualCoalesce=false and directly store the results.
Template parameters are used to generate efficient code for various special cases For example, we have to handle arrays that are a multiple of the block size (fullBlocks) differently than arrays that are not. "loop" is used when persistent CTAs are used.
By persistent CTAs we mean that we launch only as many thread blocks as can be resident in the GPU and no more, rather than launching as many threads as we have elements. Persistent CTAs loop over blocks of elements until all work is complete. This can be faster in some cases. In our tests it is faster for large sorts (and the threshold is higher on compute version 1.1 and earlier GPUs than it is on compute version 1.2 GPUs.
[out] | outKeys | Output result of reorderDataKeysOnly() |
[in] | keys | Keys to be reordered |
[in] | blockOffsets | Start offset for each block |
[in] | offsets | Offset of each radix within each block |
[in] | sizes | Number of elements in a block |
[in] | numElements | Total number of elements |
[in] | totalBlocks | Total number of blocks |
__global__ void gen_randMD5 | ( | uint4 * | d_out, |
size_t | numElements, | ||
unsigned int | seed | ||
) |
The main MD5 generation algorithm.
This function runs the MD5 hashing random number generator. It generates MD5 hashes, and uses the output as randomized bits. To repeatedly call this function, always call cudppRandSeed() first to set a new seed or else the output may be the same due to the deterministic nature of hashes. gen_randMD5 generates 128 random bits per thread. Therefore, the parameter d_out is expected to be an array of type uint4 with numElements indicies.
[out] | d_out | the output array of type uint4. |
[in] | numElements | the number of elements in d_out |
[in] | seed | the random seed used to vary the output |
__global__ void reduce | ( | T * | odata, |
const T * | idata, | ||
unsigned int | n | ||
) |
Main reduction kernel.
This reduction kernel adds multiple elements per thread sequentially, and then the threads work together to produce a block sum in shared memory. The code is optimized using warp-synchronous programming to eliminate unnecessary barrier synchronization. Performing sequential work in each thread before performing the log(N) parallel summation reduces the overall cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). (Brent's Theorem optimization)
[out] | odata | The output data pointer. Each block writes a single output element. |
[in] | idata | The input data pointer. |
[in] | n | The number of elements to be reduced. |
__global__ void scan4 | ( | T * | d_out, |
const T * | d_in, | ||
T * | d_blockSums, | ||
int | numElements, | ||
unsigned int | dataRowPitch, | ||
unsigned int | blockSumRowPitch | ||
) |
Main scan kernel.
This global device function performs one level of a multiblock scan on an arbitrary-dimensioned array in d_in, returning the result in d_out (which may point to the same array). The same function may be used for single or multi-row scans. To perform a multirow scan, pass the width of each row of the input row (in elements) in dataRowPitch, and the width of the rows of d_blockSums (in elements) in blockSumRowPitch, and invoke with a thread block grid with height greater than 1.
This function peforms one level of a recursive, multiblock scan. At the app level, this function is called by cudppScan and cudppMultiScan and used in combination with vectorAddUniform4() to produce a complete scan.
Template parameter T is the datatype of the array to be scanned. Template parameter traits is the ScanTraits struct containing compile-time options for the scan, such as whether it is forward or backward, exclusive or inclusive, multi- or single-row, etc.
[out] | d_out | The output (scanned) array |
[in] | d_in | The input array to be scanned |
[out] | d_blockSums | The array of per-block sums |
[in] | numElements | The number of elements to scan |
[in] | dataRowPitch | The width of each row of d_in in elements (for multi-row scans) |
[in] | blockSumRowPitch | The with of each row of d_blockSums in elements (for multi-row scans) |
__global__ void segmentedScan4 | ( | T * | d_odata, |
const T * | d_idata, | ||
const unsigned int * | d_iflags, | ||
unsigned int | numElements, | ||
T * | d_blockSums = 0 , |
||
unsigned int * | d_blockFlags = 0 , |
||
unsigned int * | d_blockIndices = 0 |
||
) |
Main segmented scan kernel.
This global device function performs one level of a multiblock segmented scan on an one-dimensioned array in d_idata, returning the result in d_odata (which may point to the same array).
This function performs one level of a recursive, multiblock scan. At the app level, this function is called by cudppSegmentedScan and used in combination with either vectorSegmentedAddUniform4() (forward) or vectorSegmentedAddUniformToRight4() (backward) to produce a complete segmented scan.
Template parameter T is the datatype of the array to be scanned. Template parameter traits is the SegmentedScanTraits struct containing compile-time options for the segmented scan, such as whether it is forward or backward, inclusive or exclusive, etc.
[out] | d_odata | The output (scanned) array |
[in] | d_idata | The input array to be scanned |
[in] | d_iflags | The input array of flags |
[out] | d_blockSums | The array of per-block sums |
[out] | d_blockFlags | The array of per-block OR-reduction of flags |
[out] | d_blockIndices | The array of per-block min-reduction of indices |
[in] | numElements | The number of elements to scan |
__global__ void sparseMatrixVectorFetchAndMultiply | ( | unsigned int * | d_flags, |
T * | d_prod, | ||
const T * | d_A, | ||
const T * | d_x, | ||
const unsigned int * | d_indx, | ||
unsigned int | numNZElts | ||
) |
Fetch and multiply kernel.
This global device function takes an element from the vector d_A, finds its column in d_indx and multiplies the element from d_A with its corresponding (that is having the same row) element in d_x and stores the resulting product in d_prod. It also sets all the elements of d_flags to 0.
Template parameter T is the datatype of the matrix A and x.
[out] | d_flags | The output flags array |
[out] | d_prod | The output products array |
[in] | d_A | The input matrix A |
[in] | d_x | The input array x |
[in] | d_indx | The input array of column indices for each element in A |
[in] | numNZElts | The number of non-zero elements in matrix A |
__global__ void sparseMatrixVectorSetFlags | ( | unsigned int * | d_flags, |
const unsigned int * | d_rowindx, | ||
unsigned int | numRows | ||
) |
Set Flags kernel.
This global device function takes an element from the vector d_rowindx, and sets the corresponding position in d_flags to 1
[out] | d_flags | The output flags array |
[in] | d_rowindx | The starting index of each row in the "flattened" version of matrix A |
[in] | numRows | The number of rows in matrix A |
__global__ void yGather | ( | T * | d_y, |
const T * | d_prod, | ||
const unsigned int * | d_rowFindx, | ||
unsigned int | numRows | ||
) |
Gather final y values kernel.
This global device function takes an element from the vector d_rowFindx, which for each row gives the index of the last element of that row, reads the corresponding position in d_prod and write it in d_y
Template parameter T is the datatype of the matrix A and x.
[out] | d_y | The output result array |
[in] | d_prod | The input products array (which now contains sums for each row) |
[in] | d_rowFindx | The starting index of each row in the "flattened" version of matrix A |
[in] | numRows | The number of rows in matrix A |
__global__ void alignedOffsets | ( | unsigned int * | numSpaces, |
unsigned int * | d_address, | ||
unsigned char * | d_stringVals, | ||
unsigned char | termC, | ||
unsigned int | numElements, | ||
unsigned int | stringSize | ||
) |
Calculate the number of spaces required for each string to align the string array.
[out] | numSpaces | Number of spaces required for each string |
[in] | d_address | Input addresses of each string |
[in] | d_stringVals | String array |
[in] | address | Input addresses of unpacked strings |
[in] | termC | Termination character for the strings |
[in] | numElements | Number of strings |
[in] | stringSize | Number of characters in the string array |
__global__ void alignString | ( | unsigned int * | packedStrings, |
unsigned char * | d_stringVals, | ||
unsigned int * | packedAddress, | ||
unsigned int * | address, | ||
unsigned int | numElements, | ||
unsigned int | stringArrayLength, | ||
unsigned char | termC | ||
) |
Packs strings into unsigned ints to be sorted later. These packed strings will also be aligned.
[out] | packedStrings | Resulting packed strings. |
[in] | d_stringVals | Unpacked string array which we will pack |
[out] | packedAddresses | Resulting addresses for each string to the packedStrings array |
[in] | address | Input addresses of unpacked strings |
[in] | numElements | Number of strings |
[in] | stringArrayLength | Number of characters in the string array |
[in] | termC | Termination character for the strings |
__global__ void createKeys | ( | unsigned int * | d_keys, |
unsigned int * | packedStrings, | ||
unsigned int * | packedAddress, | ||
unsigned int | numElements | ||
) |
Create keys (first four characters stuffed in an uint) from the addresses to the strings, and the string array.
[out] | d_keys | Resulting keys |
[in] | packedStrings | Packed string array. |
[in] | packedAddress | Addresses which point to the string array. |
[in] | numElements | Number of strings |
__global__ void unpackAddresses | ( | unsigned int * | packedAddress, |
unsigned int * | packedAddressRef, | ||
unsigned int * | address, | ||
unsigned int * | addressRef, | ||
size_t | numElements | ||
) |
Converts addresses from packed (unaligned) form to unpacked and unaligned form Resulting aligned strings begin in our string array packed in an unsigned int and aligned such that each string begins at the start of a uint (divisible by 4)
[in] | packedAddress | Resulting packed addresses that have been sorted. All strings are aligned. |
[in] | packedAddressRef | Original array after packing (before sort). Used as a reference. |
[out] | address | Final output of sorted addresses in unpacked form. |
[in] | addressRef | Reference array of original unpacked addresses. |
[in] | numElements | Number of strings |
__global__ void blockWiseStringSort | ( | T * | A_keys, |
T * | A_address, | ||
T * | stringVals, | ||
int | blockSize, | ||
int | totalSize, | ||
unsigned int | stringSize, | ||
unsigned char | termC | ||
) |
Does an initial blockSort based on the size of our partition (limited by shared memory size)
[in,out] | A_keys,A_address | This sort is in-place. A_keys and A_address store the key (first four characters) and addresses of our strings |
[in] | stringVals | Global array of strings for tie breaks |
[in] | blockSize | size of each block |
[in] | totalSize | The total size of the array we are sorting |
[in] | stringSize | The size of our string array (stringVals) |
__global__ void simpleStringMerge | ( | T * | A_keys, |
T * | A_keys_out, | ||
T * | A_values, | ||
T * | A_values_out, | ||
T * | stringValues, | ||
int | sizePerPartition, | ||
int | size, | ||
int | step, | ||
int | stringSize, | ||
unsigned char | termC | ||
) |
Merges two independent sets. Each CUDA block works on two partitions of data without cooperating.
[in] | A_keys | First four characters (input) of our sets to merge |
[in] | A_values | Addresses of the strings (for tie breaks) |
[in] | stringValues | Global string array for tie breaks |
[out] | A_keys_out,A_values_out | Keys and values array after merge step |
[in] | sizePerPartition | The size of each partition for this merge step |
[in] | size | Global size of our array |
[in] | step | Number of merges done so far |
[in] | stringSize | global string length |
__global__ void findMultiPartitions | ( | T * | A_keys, |
T * | A_address, | ||
T * | stringValues, | ||
int | splitsPP, | ||
int | numPartitions, | ||
int | partitionSize, | ||
unsigned int * | partitionBeginA, | ||
unsigned int * | partitionSizesA, | ||
unsigned int * | partitionBeginB, | ||
unsigned int * | partitionSizesB, | ||
size_t | size, | ||
size_t | stringSize, | ||
unsigned char | termC | ||
) |
For our multiMerge kernels we need to divide our partitions into smaller partitions. This kernel breaks up a set of partitions into splitsPP*numPartitions subpartitions.
[in] | A_keys,A_address | First four characters (input), and addresses of our inputs |
[in] | stringValues | Global string array for tie breaks |
[in] | splitsPP,numPartitions,partitionSize | Partition information for this routine (splitsPP=splits Per Partition) |
[in] | partitionBeginA,partitionSizesA | Partition starting points and sizes for each new subpartition in our original set in A |
[in] | partitionBeginB,partitionSizesB | Partition starting points and sizes for each new subpartition in our original set in B |
[in] | size,stringSize | Number of elements in our set, and size of our global string array |
__global__ void stringMergeMulti | ( | T * | A_keys, |
T * | A_keys_out, | ||
T * | A_values, | ||
T * | A_values_out, | ||
T * | stringValues, | ||
int | subPartitions, | ||
int | numBlocks, | ||
unsigned int * | partitionBeginA, | ||
unsigned int * | partitionSizeA, | ||
unsigned int * | partitionBeginB, | ||
unsigned int * | partitionSizeB, | ||
int | entirePartitionSize, | ||
int | step, | ||
size_t | size, | ||
size_t | stringSize, | ||
unsigned char | termC | ||
) |
Main merge kernel where multiple CUDA blocks cooperate to merge a partition(s)
[in] | A_keys,A_values | First four characters (input), and addresses of our inputs |
[out] | A_keys_out,A_values_out | First four characters, and addresses for our outputs(ping-pong) |
[in] | stringValues | string array for tie breaks |
[out] | subPartitions,numBlocks | Number of splits per partitions and number of partitions respectively |
[in] | partitionBeginA,partitionSizeA | Where partitions begin and how large they are for Segment A |
[in] | partitionBeginB,partitionSizeB | Where partitions begin and how large they are for Segment B |
[in] | entirePartitionSize | The maximum length of a partition |
[in] | step | Number of merge cycles done |
[in] | size | Number of total strings being sorted |
[in] | stringSize | Length of string array |
__global__ void crpcrKernel | ( | T * | d_a, |
T * | d_b, | ||
T * | d_c, | ||
T * | d_d, | ||
T * | d_x, | ||
unsigned int | systemSizeOriginal, | ||
unsigned int | iterations | ||
) |
Hybrid CR-PCR Tridiagonal linear system solver (CRPCR)
This kernel solves a tridiagonal linear system using a hybrid CR-PCR algorithm. The solver first reduces the system size using cyclic reduction, then solves the intermediate system using parallel cyclic reduction to reduce shared memory bank conflicts and algorithmic steps, and finally switches back to cyclic reduction to solve all unknowns.
[out] | d_x | Solution vector |
[in] | d_a | Lower diagonal |
[in] | d_b | Main diagonal |
[in] | d_c | Upper diagonal |
[in] | d_d | Right hand side |
[in] | systemSizeOriginal | The size of each system |
[in] | iterations | The computed number of PCR iterations |
__global__ void vectorAddConstant | ( | T * | d_vector, |
T | constant, | ||
int | n, | ||
int | baseIndex | ||
) |
Adds a constant value to all values in the input d_vector.
Each thread adds two pairs of elements.
[in,out] | d_vector | The array of elements to be modified |
[in] | constant | The constant value to be added to elements of d_vector |
[in] | n | The number of elements in the d_vector to be modified |
[in] | baseIndex | An optional offset to the beginning of the elements in the input array to be processed |
__global__ void vectorAddUniform | ( | T * | d_vector, |
const T * | d_uniforms, | ||
int | numElements, | ||
int | blockOffset, | ||
int | baseIndex | ||
) |
Add a uniform value to each data element of an array.
This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds two pairs of values.
[out] | d_vector | The d_vector whose values will have the uniform added |
[in] | d_uniforms | The array of uniform values (one per CTA) |
[in] | numElements | The number of elements in d_vector to process |
[in] | blockOffset | an optional offset to the beginning of this block's data. |
[in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
__global__ void vectorAddUniform4 | ( | T * | d_vector, |
const T * | d_uniforms, | ||
int | numElements, | ||
int | vectorRowPitch, | ||
int | uniformRowPitch, | ||
int | blockOffset, | ||
int | baseIndex | ||
) |
Add a uniform value to each data element of an array (vec4 version)
This function reads one value per CTA from d_uniforms into shared memory and adds that value to all values "owned" by the CTA in d_vector. Each thread adds the uniform value to eight values in d_vector.
[out] | d_vector | The d_vector whose values will have the uniform added |
[in] | d_uniforms | The array of uniform values (one per CTA) |
[in] | numElements | The number of elements in d_vector to process |
[in] | vectorRowPitch | For 2D arrays, the pitch (in elements) of the rows of d_vector. |
[in] | uniformRowPitch | For 2D arrays, the pitch (in elements) of the rows of d_uniforms. |
[in] | blockOffset | an optional offset to the beginning of this block's data. |
[in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
__global__ void vectorAddVector | ( | T * | d_vectorA, |
const T * | d_vectorB, | ||
int | numElements, | ||
int | baseIndex | ||
) |
Adds together two vectors.
Each thread adds two pairs of elements.
[out] | d_vectorA | The left operand array and the result |
[in] | d_vectorB | The right operand array |
[in] | numElements | The number of elements in the vectors to be added. |
[in] | baseIndex | An optional offset to the beginning of the elements in the input arrays to be processed |
__global__ void vectorSegmentedAddUniform4 | ( | T * | d_vector, |
const T * | d_uniforms, | ||
const unsigned int * | d_maxIndices, | ||
unsigned int | numElements, | ||
int | blockOffset, | ||
int | baseIndex | ||
) |
Add a uniform value to data elements of an array (vec4 version)
This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.
[out] | d_vector | The d_vector whose values will have the uniform added |
[in] | d_uniforms | The array of uniform values (one per CTA) |
[in] | d_maxIndices | The array of maximum indices (one per CTA). This is index upto which the uniform would be added. If this is UINT_MAX the uniform is added to all elements of the CTA. This index is 1-based. |
[in] | numElements | The number of elements in d_vector to process |
[in] | blockOffset | an optional offset to the beginning of this block's data. |
[in] | baseIndex | an optional offset to the beginning of the array within d_vector. |
__global__ void vectorSegmentedAddUniformToRight4 | ( | T * | d_vector, |
const T * | d_uniforms, | ||
const unsigned int * | d_minIndices, | ||
unsigned int | numElements, | ||
int | blockOffset, | ||
int | baseIndex | ||
) |
Add a uniform value to data elements of an array (vec4 version)
This function reads one value per CTA from d_uniforms into shared memory and adds that value to values "owned" by the CTA in d_vector. The uniform value is added to only those values "owned" by the CTA which have an index greater than d_minIndex. If d_minIndex for that CTA is 0 it adds the uniform to all values "owned" by the CTA. Each thread adds the uniform value to eight values in d_vector.
[out] | d_vector | The d_vector whose values will have the uniform added |
[in] | d_uniforms | The array of uniform values (one per CTA) |
[in] | d_minIndices | The array of minimum indices (one per CTA). The uniform is added to the right of this index (that is, to every index that is greater than this index). If this is 0, the uniform is added to all elements of the CTA. This index is 1-based to prevent overloading of what 0 means. In our case it means absence of a flag. But if the first element of a CTA has flag the index will also be 0. Hence we use 1-based indices so the index is 1 in the latter case. |
[in] | numElements | The number of elements in d_vector to process |
[in] | blockOffset | an optional offset to the beginning of this block's data. |
[in] | baseIndex | an optional offset to the beginning of the array within d_vector. |