CUDPP 2.0
CUDA Data-Parallel Primitives Library
|
Classes | |
class | CudaHT::CuckooHashing::HashTable |
Basic hash table that stores one value for each key. More... | |
Modules | |
Hash Table Data Structures and Constants | |
Compact Functions | |
void | calculateCompactLaunchParams (const unsigned int numElements, unsigned int &numThreads, unsigned int &numBlocks, unsigned int &numEltsPerBlock) |
Calculate launch parameters for compactArray(). | |
template<class T > | |
void | compactArray (T *d_out, size_t *d_numValidElements, const T *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan) |
Compact the non-zero elements of an array. | |
void | allocCompactStorage (CUDPPCompactPlan *plan) |
Allocate intermediate arrays used by cudppCompact(). | |
void | freeCompactStorage (CUDPPCompactPlan *plan) |
Deallocate intermediate storage used by cudppCompact(). | |
void | cudppCompactDispatch (void *d_out, size_t *d_numValidElements, const void *d_in, const unsigned int *d_isValid, size_t numElements, const CUDPPCompactPlan *plan) |
Dispatch compactArray for the specified datatype. | |
RadixSort Functions | |
void | allocRadixSortStorage (CUDPPRadixSortPlan *plan) |
From the programmer-specified sort configuration, creates internal memory for performing the sort. | |
void | freeRadixSortStorage (CUDPPRadixSortPlan *plan) |
Deallocates intermediate memory from allocRadixSortStorage. | |
template<typename T > | |
void | runSort (T *pkeys, unsigned int *pvals, size_t numElements, const CUDPPRadixSortPlan *plan) |
void | cudppRadixSortDispatch (void *keys, void *values, size_t numElements, const CUDPPRadixSortPlan *plan) |
Dispatch function to perform a sort on an array with a specified configuration. | |
Scan Functions | |
template<class T , bool isBackward, bool isExclusive, class Op > | |
void | scanArrayRecursive (T *d_out, const T *d_in, T **d_blockSums, size_t numElements, size_t numRows, const size_t *rowPitches, int level) |
Perform recursive scan on arbitrary size arrays. | |
void | allocScanStorage (CUDPPScanPlan *plan) |
Allocate intermediate arrays used by scan. | |
void | freeScanStorage (CUDPPScanPlan *plan) |
Deallocate intermediate block sums arrays in a CUDPPScanPlan object. | |
template<typename T , bool isBackward, bool isExclusive> | |
void | cudppScanDispatchOperator (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
template<bool isBackward, bool isExclusive> | |
void | cudppScanDispatchType (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
void | cudppScanDispatch (void *d_out, const void *d_in, size_t numElements, size_t numRows, const CUDPPScanPlan *plan) |
Dispatch function to perform a scan (prefix sum) on an array with the specified configuration. | |
Tridiagonal functions | |
template<typename T > | |
unsigned int | crpcrSharedSize (unsigned int systemSizeOriginal) |
template<typename T > | |
void | crpcr (T *d_a, T *d_b, T *d_c, T *d_d, T *d_x, unsigned int systemSizeOriginal, unsigned int numSystems) |
Hybrid CR-PCR solver (CRPCR) | |
CUDPPResult | cudppTridiagonalDispatch (void *d_a, void *d_b, void *d_c, void *d_d, void *d_x, int systemSize, int numSystems, const CUDPPTridiagonalPlan *plan) |
Dispatches the tridiagonal function based on the plan. |
The CUDPP Application-Level API contains functions that run on the host CPU and invoke GPU routines in the CUDPP Kernel-Level API. Application-Level API functions are used by CUDPP Public Interface functions to implement CUDPP's core functionality.
void calculateCompactLaunchParams | ( | const unsigned int | numElements, |
unsigned int & | numThreads, | ||
unsigned int & | numBlocks, | ||
unsigned int & | numEltsPerBlock | ||
) |
Calculate launch parameters for compactArray().
Calculates the block size and number of blocks from the total number of elements and the maximum threads per block. Called by compactArray().
The calculation is pretty straightforward - the number of blocks is calculated by dividing the number of input elements by the product of the number of threads in each CTA and the number of elements each thread will process. numThreads and numEltsPerBlock are also simple to calculate. Please note that in cases where numElements is not an exact multiple of SCAN_ELTS_PER_THREAD * CTA_SIZE, some threads will do nothing or one thread will process fewer than SCAN_ELTS_PER_THREAD elements.
[in] | numElements | Number of elements to compact |
[out] | numThreads | Number of threads in each block |
[out] | numBlocks | Number of blocks |
[out] | numEltsPerBlock | Number of elements processed per block |
void compactArray | ( | T * | d_out, |
size_t * | d_numValidElements, | ||
const T * | d_in, | ||
const unsigned int * | d_isValid, | ||
size_t | numElements, | ||
const CUDPPCompactPlan * | plan | ||
) |
Compact the non-zero elements of an array.
Given an input array d_in, compactArray() outputs a compacted version which does not have null (zero) elements. Also outputs the number of non-zero elements in the compacted array. Called by cudppCompactDispatch().
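The algorithm is straightforward, involving two steps (most of the complexity is hidden in scan, invoked with cudppScanDispatch()): first, a scan of d_isValid computes the output index of each valid element; second, each valid element of d_in is written to d_out at its computed output index.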
[out] | d_out | Array of compacted non-null elements |
[out] | d_numValidElements | Pointer to a size_t in which to store the number of non-null elements |
[in] | d_in | Input array |
[in] | d_isValid | Array of flags, 1 for each non-null element, 0 for each null element. Same length as d_in |
[in] | numElements | Number of elements in input array |
[in] | plan | Pointer to the plan object used for this compact |
void allocCompactStorage | ( | CUDPPCompactPlan * | plan | ) |
Allocate intermediate arrays used by cudppCompact().
In addition to the internal CUDPPScanPlan contained in CUDPPCompactPlan, CUDPPCompact also needs a temporary device array of output indices, which is allocated by this function.
plan | Pointer to CUDPPCompactPlan object within which intermediate storage is allocated. |
void freeCompactStorage | ( | CUDPPCompactPlan * | plan | ) |
Deallocate intermediate storage used by cudppCompact().
Deallocates the output indices array allocated by allocCompactStorage().
plan | Pointer to CUDPPCompactPlan object initialized by allocCompactStorage(). |
void cudppCompactDispatch | ( | void * | d_out, |
size_t * | d_numValidElements, | ||
const void * | d_in, | ||
const unsigned int * | d_isValid, | ||
size_t | numElements, | ||
const CUDPPCompactPlan * | plan | ||
) |
Dispatch compactArray for the specified datatype.
A thin wrapper on top of compactArray() which calls compactArray() for the data type specified in plan. This is the app-level interface to compact used by cudppCompact().
[out] | d_out | Compacted array of non-zero elements |
[out] | d_numValidElements | Pointer to a size_t in which to store the number of non-zero elements |
[in] | d_in | Input array |
[in] | d_isValid | Array of boolean valid flags with same length as d_in |
[in] | numElements | Number of elements to compact |
[in] | plan | Pointer to plan object for this compact |
void allocRadixSortStorage | ( | CUDPPRadixSortPlan * | plan | ) |
From the programmer-specified sort configuration, creates internal memory for performing the sort.
[in] | plan | Pointer to CUDPPRadixSortPlan object |
void freeRadixSortStorage | ( | CUDPPRadixSortPlan * | plan | ) |
Deallocates intermediate memory from allocRadixSortStorage.
[in] | plan | Pointer to CUDPPRadixSortPlan object |
void cudppRadixSortDispatch | ( | void * | keys, |
void * | values, | ||
size_t | numElements, | ||
const CUDPPRadixSortPlan * | plan | ||
) |
Dispatch function to perform a sort on an array with a specified configuration.
This is the dispatch routine which calls radixSort...() with appropriate template parameters and arguments as specified by the plan.
[in,out] | keys | Keys to be sorted. |
[in,out] | values | Associated values to be sorted (through keys). |
[in] | numElements | Number of elements in the sort. |
[in] | plan | Configuration information for RadixSort. |
void scanArrayRecursive | ( | T * | d_out, |
const T * | d_in, | ||
T ** | d_blockSums, | ||
size_t | numElements, | ||
size_t | numRows, | ||
const size_t * | rowPitches, | ||
int | level | ||
) |
Perform recursive scan on arbitrary size arrays.
This is the CPU-side workhorse function of the scan engine. This function invokes the CUDA kernels which perform the scan on individual blocks.
Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scan, scanArrayRecursive() first invokes a kernel to scan all blocks of that level, and if the level has more than one block, it calls itself recursively. On returning from each recursive level, the total sum of each block from the level below is added to all elements of the corresponding block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see References).
Template parameter T is the datatype; isBackward specifies backward or forward scan; isExclusive specifies exclusive or inclusive scan; and Op specifies the binary associative operator to be used.
[out] | d_out | The output array for the scan results |
[in] | d_in | The input array to be scanned |
[out] | d_blockSums | Array of arrays of per-block sums (one array per recursive level, allocated by allocScanStorage()) |
[in] | numElements | The number of elements in the array to scan |
[in] | numRows | The number of rows in the array to scan |
[in] | rowPitches | Array of row pitches (one pitch per recursive level, allocated by allocScanStorage()) |
[in] | level | The current recursive level of the scan |
void allocScanStorage | ( | CUDPPScanPlan * | plan | ) |
Allocate intermediate arrays used by scan.
Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans, where each block is scanned by a single CUDA thread block. At each recursive level of the scan, we need an array in which to store the total sums of all blocks in that level. This function computes the amount of storage needed and allocates it.
plan | Pointer to CUDPPScanPlan object containing options and number of elements, which is used to compute storage requirements, and within which intermediate storage is allocated. |
void freeScanStorage | ( | CUDPPScanPlan * | plan | ) |
Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
These arrays must have been allocated by allocScanStorage(), which is called by the constructor of CUDPPScanPlan.
plan | Pointer to CUDPPScanPlan object initialized by allocScanStorage(). |
void cudppScanDispatch | ( | void * | d_out, |
const void * | d_in, | ||
size_t | numElements, | ||
size_t | numRows, | ||
const CUDPPScanPlan * | plan | ||
) |
Dispatch function to perform a scan (prefix sum) on an array with the specified configuration.
This is the dispatch routine which calls scanArrayRecursive() with appropriate template parameters and arguments to achieve the scan as specified in plan.
[out] | d_out | The output array of scan results |
[in] | d_in | The input array |
[in] | numElements | The number of elements to scan |
[in] | numRows | The number of rows to scan in parallel |
[in] | plan | Pointer to CUDPPScanPlan object containing scan options and intermediate storage |
void crpcr | ( | T * | d_a, |
T * | d_b, | ||
T * | d_c, | ||
T * | d_d, | ||
T * | d_x, | ||
unsigned int | systemSizeOriginal, | ||
unsigned int | numSystems | ||
) |
Hybrid CR-PCR solver (CRPCR)
This is a wrapper function for the GPU CR-PCR kernel.
[out] | d_x | Solution vector |
[in] | d_a | Lower diagonal |
[in] | d_b | Main diagonal |
[in] | d_c | Upper diagonal |
[in] | d_d | Right hand side |
[in] | systemSizeOriginal | The size of the linear system |
[in] | numSystems | The number of systems to be solved |
CUDPPResult cudppTridiagonalDispatch | ( | void * | d_a, |
void * | d_b, | ||
void * | d_c, | ||
void * | d_d, | ||
void * | d_x, | ||
int | systemSize, | ||
int | numSystems, | ||
const CUDPPTridiagonalPlan * | plan | ||
) |
Dispatches the tridiagonal function based on the plan.
This is the dispatch call for the tridiagonal solver in either float or double datatype.
[out] | d_x | Solution vector |
[in] | d_a | Lower diagonal |
[in] | d_b | Main diagonal |
[in] | d_c | Upper diagonal |
[in] | d_d | Right hand side |
[in] | systemSize | The size of the linear system |
[in] | numSystems | The number of systems to be solved |
[in] | plan | Pointer to the CUDPPTridiagonalPlan object |