Classes
class	ScanTraits< T, Oper, backward, exclusive, multiRow, sums, fullBlock >
	Template class containing compile-time parameters to the scan functions. More...

class	SegmentedScanTraits< T, Oper, backward, exclusivity, doShiftFlags, fullBlock, sums, sm12OrBetter >
	Template class containing compile-time parameters to the segmented scan functions. More...

Compress Functions
template<class T , int depth>
__device__ void	binSearch_frag_mult (T keyArraySmem, T valueArraySmem, int offset, int &mid, T cmpValue, T testValue, int myAddress, int testGlobalIndex, T globalPointerArray, T globalStringArray, int bIndex, size_t numElements)

template<class T , int depth>
__device__ void	linearStringMerge (T searchArray, T pointerArray, T A_values, T myKey, T myAddress, int &index, T &cmpValue, T saveGlobalArray, T savePointerArray, T stringValues, int myStartIdxC, int myStartIdxA, int myStartIdxB, int localAPartSize, int localBPartSize, int localCPartSize, T localMaxB, T finalMaxB, T localMinB, int tid, int aIndex, int bIndex, int offset, int subPartitions, size_t numElements)

template<class T , int depth>
__device__ void	binSearch_fragment (T binArray, T pointerBinArray, int offset, int &mid, T cmpValue, T testValue, T myAddress, T globalStringArray, T globalStringArray2, size_t numElements)

template<class T , int depth>
__device__ void	lin_merge_simple (T &cmpValue, T myKey, T myAddress, int &index, T BKeys, T BValues, T stringValues, T A_values, T A_keys_out, T A_values_out, int myStartIdxA, int myStartIdxB, int myStartIdxC, T localMinB, T localMaxB, int aCont, int bCont, int totalSize, int sizePerPartition, int i, T *stringValues2, size_t numElements)

template<class T , int depth>
__device__ void	bin_search_block (T &cmpValue, T tmpVal, T in, T addressPad, const T stringVals, int &j, int bump, T stringVals2, size_t numElements)

template<class T , int depth>
__device__ void	lin_search_block (T &cmpValue, T &tmpVal, T in, T addressPad, const T stringVals, int &j, int offset, int last, int startAddress, int addPart, T stringVals2, size_t numElements)

template<class T >
__device__ void	compareSwapVal (T &A1, T &A2, const int index1, const int index2, T scratch, const T stringVals, T *stringVals2, size_t numElements)

__device__ void	BitArraySetBit (huffman_code *ba, unsigned int bit)

__device__ void	BitArrayShiftLeft (huffman_code *ba, unsigned int shifts)

__device__ void	BitArrayShiftRight (huffman_code *ba, unsigned int shifts)

__device__ int	FindMinimumCount (my_huffman_node_t *ht, int elements)

Merge Sort Functions
typedef unsigned int	uint

template<class T , int depth>
__device__ void	bin_search_block (T &cmpValue, T tmpVal, T *in, unsigned int &j, unsigned int bump, unsigned int addPart)
	Binary search within a single block (blockSort) More...

template<class T , int depth>
__device__ void	lin_search_block (T &cmpValue, T mVal, unsigned int &tmpVal, T in, unsigned int addressPad, unsigned int &j, unsigned int offset, unsigned int last, unsigned int startAddress, unsigned int addPart)
	Linear search within a single block (blockSort) More...

template<class T >
__device__ void	compareSwapVal (T &A1, T &A2, unsigned int &ref1, unsigned int &ref2)
	For blockSort. Compares two values and decides to swap if A1 > A2. More...

template<class T >
__device__ void	binSearch_fragment_lower (T *binArray, int offset, int &mid, T testValue)

template<class T >
__device__ void	binSearch_fragment_higher (T *binArray, int offset, int &mid, T testValue)

template<class T >
__device__ void	binSearch_whole_lower (T *BKeys, int &index, T myKey)

template<class T >
__device__ void	binSearch_whole_higher (T *BKeys, int &index, T myKey)

template<class T , int depth>
__device__ void	linearMerge_lower (T searchArray, T myKey, unsigned int myVal, int &index, T saveGlobalArray, unsigned int *saveValueArray, int myStartIdxC, T nextMaxB, int localAPartSize, int localBPartSize, T localMaxB, T localMinB, int aIndex, int bIndex, int offset)
	Performs a linear search in our shared memory (done after binary search). It merges the partition on the left side with the associated partition on the right side. More...

template<class T , int depth>
__device__ void	linearMerge_higher (T searchArray, T myKey, unsigned int myVal, int &index, T saveGlobalArray, unsigned int *saveValueArray, int myStartIdxC, T localMinB, T nextMaxB, int aIndex, int bIndex, int offset, int localAPartSize, int localBPartSize)
	Performs a linear search in our shared memory (done after binary search). It merges the partition on the right side with the associated partition on the left side. More...

#define	BLOCKSORT_SIZE 1024

#define	CTA_BLOCK 128

#define	DEPTH_simple 2

#define	DEPTH_multi 4

#define	CTASIZE_simple 256

#define	CTASIZE_multi 128

#define	INTERSECT_A_BLOCK_SIZE_simple DEPTH_simple*CTASIZE_simple

#define	INTERSECT_B_BLOCK_SIZE_simple 2DEPTH_simpleCTASIZE_simple

#define	INTERSECT_A_BLOCK_SIZE_multi DEPTH_multi*CTASIZE_multi

#define	INTERSECT_B_BLOCK_SIZE_multi 2DEPTH_multiCTASIZE_multi

Radix Sort Functions
typedef unsigned int	uint

template<bool doFlip>
__device__ uint	floatFlip (uint f)
	Flips bits of single-precision floating-point number (parameterized by doFlip) More...

template<bool doFlip>
__device__ uint	floatUnflip (uint f)
	Reverses bit-flip of single-precision floating-point number (parameterized by doFlip) More...

template<class T , int maxlevel>
__device__ T	scanwarp (T val, volatile T *sData)
	Scans one warp quickly, optimized for 32-element warps, using shared memory. More...

__device__ uint4	scan4 (uint4 idata)
	Scans 4*CTA_SIZE unsigned ints in a block. More...

template<int ctasize>
__device__ uint4	rank4 (uint4 preds)
	Computes output position for each thread given predicate; trues come first then falses. More...

template<uint nbits, uint startbit>
__device__ void	radixSortBlock (uint4 &key, uint4 &value)
	Sorts one block. More...

template<uint nbits, uint startbit>
__device__ void	radixSortBlockKeysOnly (uint4 &key)
	Sorts one block. Key-only version. More...

Rand Functions
__device__ void	swizzleShift (uint4 *f)
	Does a GLSL-style swizzle assigning f->xyzw = f->yzwx. More...

__device__ unsigned int	leftRotate (unsigned int x, unsigned int n)
	Rotates the bits in x over by n bits. More...

__device__ unsigned int	F (unsigned int x, unsigned int y, unsigned int z)
	The F scrambling function. More...

__device__ unsigned int	G (unsigned int x, unsigned int y, unsigned int z)
	The G scrambling function. More...

__device__ unsigned int	H (unsigned int x, unsigned int y, unsigned int z)
	The H scrambling function. More...

__device__ unsigned int	I (unsigned int x, unsigned int y, unsigned int z)
	The I scrambling function. More...

__device__ void	FF (uint4 td, int i, uint4 Fr, float p, unsigned int *data)
	The FF scrambling function. More...

__device__ void	GG (uint4 td, int i, uint4 Gr, float p, unsigned int *data)
	The GG scrambling function. More...

__device__ void	HH (uint4 td, int i, uint4 Hr, float p, unsigned int *data)
	The HH scrambling function. More...

__device__ void	II (uint4 td, int i, uint4 Ir, float p, unsigned int *data)
	The II scrambling function. More...

__device__ void	setupInput (unsigned int *input, unsigned int seed)
	Sets up the input array using information of seed, and threadIdx. More...

Scan Functions
template<class T , class traits >
__device__ void	loadSharedChunkFromMem4 (T s_out, T threadScan[2][4], const T d_in, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
	Handles loading input s_data from global memory to shared memory (vec4 version) More...

template<class T , class traits >
__device__ void	storeSharedChunkToMem4 (T d_out, T threadScan[2][4], T s_in, int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
	Handles storing result s_data from shared memory to global memory (vec4 version) More...

template<class T , class traits >
__device__ void	loadSharedChunkFromMem2 (T s_out, T threadScan[2][2], const T d_in, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
	Handles loading input s_data from global memory to shared memory (vec4 version) More...

template<class T , class traits >
__device__ void	storeSharedChunkToMem2 (T d_out, T threadScan[2][2], T s_in, int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
	Handles storing result s_data from shared memory to global memory (vec4 version) More...

template<class T , class traits , int maxlevel>
__device__ T	warpscan (T val, volatile T *s_data)
	Scan all warps of a CTA without synchronization. More...

template<class T , class traits >
__device__ void	scanWarps (T x, T y, T *s_data)
	Perform a full CTA scan using the warp-scan algorithm. More...

template<class T , class traits >
__device__ void	scanCTA (T s_data, T d_blockSums, unsigned int blockSumIndex)
	CTA-level scan routine; scans s_data in shared memory in each thread block. More...

#define	DISALLOW_LOADSTORE_OVERLAP 1

Segmented scan Functions
template<class T , typename traits >
__device__ void	loadForSegmentedScanSharedChunkFromMem4 (T s_odata, T threadScan0[4], T threadScan1[4], unsigned int &threadFlag, unsigned int s_oflags, unsigned int s_oindices, const T d_idata, const unsigned int *d_iflags, int numElements, int iDataOffset, int &ai, int &bi, int &aiDev, int &biDev)
	Handles loading input s_data from global memory to shared memory (vec4 version) More...

template<class T , class traits >
__device__ void	storeForSegmentedScanSharedChunkToMem4 (T d_odata, T threadScan0[4], T threadScan1[4], unsigned int threadFlag, T s_idata, unsigned int numElements, int oDataOffset, int ai, int bi, int aiDev, int biDev)
	Handles storing result s_data from shared memory to global memory (vec4 version) More...

template<class T , class traits , unsigned int blockSize>
__device__ T	reduceCTA (volatile T *s_data)

template<class T , class traits , bool isExclusive, unsigned int log_simd_threads>
__device__ void	warpSegScan (T val, unsigned int flag, volatile T s_data, volatile unsigned int s_flags, T &oVal, unsigned int &oFlag, bool print=false)

template<class T , class traits >
__device__ void	segmentedScanWarps (T val1, unsigned int flag1, T val2, unsigned int flag2, T s_data, unsigned int s_flags)

template<class T , class traits >
__device__ void	segmentedScanCTA (T s_data, unsigned int s_flags, unsigned int s_indices, T d_blockSums=0, unsigned int d_blockFlags=0, unsigned int d_blockIndices=0)
	CTA-level segmented scan routine;. More...

Merge Sort Functions
typedef unsigned int	uint

__device__ int	tie_break_simp (unsigned int myLoc, unsigned int cmpLoc, unsigned int myBound, unsigned int cmpBound, unsigned int myAdd, unsigned int cmpAdd, unsigned int *stringLoc, unsigned int stringSize, unsigned char termC)
	Breaks ties in keys (first four characters) returns true if cmpVal > myVal false otherwise. More...

template<class T , int depth>
__device__ void	bin_search_block_string (T &cmpValue, T tmpVal, T in, T addressPad, T *stringVals, int &j, int bump, int sizeRemain, unsigned int stringSize, unsigned char termC)
	Binary search within a single block (blockSort) More...

template<class T , int depth>
__device__ void	lin_search_block_string (T &cmpValue, T &tmpVal, T in, T addressPad, T *stringVals, int &j, int offset, int last, int startAddress, int stringSize, unsigned char termC)
	Linear search within a single block (blockSort) More...

template<class T >
__device__ void	compareSwapVal (T &A1, T &A2, const int index1, const int index2, T scratch, T stringVals, unsigned int size, unsigned char termC)
	For blockSort. Compares two values and decides to swap if A1 > A2. More...

template<class T , int depth>
__device__ void	binSearch_fragment (T keys, T address, int offset, int &mid, T cmpValue, T testValue, T myAddress, int myLoc, int cmpLoc, int myBound, int cmpBound, T *globalStringArray, int stringSize, unsigned char termC)
	Performs a binary search in our shared memory, with tie breaks for strings. More...

template<class T , int depth>
__device__ void	binSearch_frag_mult (T keyArraySmem, T valueArraySmem, int offset, int &mid, T cmpValue, T testValue, int myAddress, T *globalStringArray, int myStartIdxA, int myStartIdxB, int aIndex, int bIndex, int size, int stringSize, unsigned char termC)

template<class T , int depth>
__device__ void	lin_merge_simple (T &cmpValue, T myKey, T myAddress, int &index, T BKeys, T BValues, T stringValues, T A_keys, T A_values, T A_keys_out, T *A_values_out, int myStartIdxA, int myStartIdxB, int myStartIdxC, T localMinB, T localMaxB, int aCont, int bCont, int totalSize, int mySizeA, int mySizeB, unsigned int stringSize, int i, int stepNum, bool &placed, unsigned char termC)
	Performs a linear search in our shared memory (done after binary search), with tie breaks for strings. More...

template<class T , int depth>
__device__ void	linearStringMerge (T BKeys, T BValues, T myKey, T myAddress, bool &placed, int &index, T &cmpValue, T A_keys, T A_values, T A_keys_out, T A_values_out, T *stringValues, int myStartIdxC, int myStartIdxA, int myStartIdxB, int localAPartSize, int localBPartSize, int localCPartSize, T localMaxB, T localMinB, int tid, int aIndex, int bIndex, int i, int stringSize, int totalSize, unsigned char termC)
	Performs a linear search in our shared memory, used by multiMerge kernel. More...

#define	BLOCKSORT_SIZE 1024

#define	CTA_BLOCK 128

#define	DEPTH_simple 2

#define	DEPTH_multi 2

#define	CTASIZE_simple 256

#define	CTASIZE_multi 256

#define	INTERSECT_A_BLOCK_SIZE_simple DEPTH_simple*CTASIZE_simple

#define	INTERSECT_B_BLOCK_SIZE_simple 2DEPTH_simpleCTASIZE_simple

#define	INTERSECT_A_BLOCK_SIZE_multi DEPTH_multi*CTASIZE_multi

#define	INTERSECT_B_BLOCK_SIZE_multi 2DEPTH_multiCTASIZE_multi

Detailed Description

The CUDPP CTA-Level API contains functions that run on the GPU device. These are CUDA device functions that are called from within other CUDA device functions (typically CUDPP Kernel-Level API functions). They are called CTA-level functions because they typically process s_data "owned" by each CTA within shared memory, and are agnostic of any other CTAs that may be running (or how many CTAs are running), other than to compute appropriate global memory addresses.

Macro Definition Documentation

#define DISALLOW_LOADSTORE_OVERLAP 1

This is used to insert syncthreads to avoid perf loss caused by 128-bit load overlap that happens on G80. This gives about a 15% boost on scans on G80.

Todo:: Parameterize this in case this perf detail changes on future GPUs.

Function Documentation

template<class T , int depth>

__device__ void bin_search_block	(	T &	cmpValue,
		T	tmpVal,
		T *	in,
		unsigned int &	j,
		unsigned int	bump,
		unsigned int	addPart
	)

Binary search within a single block (blockSort)

Parameters

[in,out]	cmpValue	Value being considered from other partition
[in]	tmpVal	My Value
[in]	in	input keys
[in,out]	j	The index we are considering
[in]	bump	The offset we update by
[in]	addPart	Tie break (left partition vs right partition)

template<class T , int depth>

__device__ void lin_search_block	(	T &	cmpValue,
		T	mVal,
		unsigned int &	tmpVal,
		T *	in,
		unsigned int *	addressPad,
		unsigned int &	j,
		unsigned int	offset,
		unsigned int	last,
		unsigned int	startAddress,
		unsigned int	addPart
	)

Linear search within a single block (blockSort)

Parameters

[in,out]	cmpValue	Value being considered from other partition
[in]	mVal	Value in our partition
[in,out]	tmpVal	Temporary register which is used to store the final address after our search
[in]	in,addressPad,in	= keys and addressPad = values
[in]	j	index in B partition we are considering
[in]	offset	Since this is register packed, offset is the ith iteration of linear search
[in]	last	The end of partition B we are allowed to look upto
[in]	startAddress	The beginning of our partition
[in]	addPart	Tie break (left partition vs right partition)

template<class T >

__device__ void compareSwapVal	(	T &	A1,
		T &	A2,
		unsigned int &	ref1,
		unsigned int &	ref2
	)

For blockSort. Compares two values and decides to swap if A1 > A2.

Parameters

[in,out]	A1	First value being compared
[in,out]	A2	Second value being compared
[in,out]	ref1	Local address of A1
[in,out]	ref2	Local address of A2

template<class T , int depth>

__device__ void linearMerge_lower	(	T *	searchArray,
		T	myKey,
		unsigned int	myVal,
		int &	index,
		T *	saveGlobalArray,
		unsigned int *	saveValueArray,
		int	myStartIdxC,
		T	nextMaxB,
		int	localAPartSize,
		int	localBPartSize,
		T	localMaxB,
		T	localMinB,
		int	aIndex,
		int	bIndex,
		int	offset
	)

inline

Performs a linear search in our shared memory (done after binary search). It merges the partition on the left side with the associated partition on the right side.

Parameters

[in]	searchArray	Array of keys
[in]	myKey	Current key being considered
[in]	myVal	Associated value of key
[in,out]	index	Index in local B partition we are comparing with
[out]	saveGlobalArray	Array of Keys after merge is complete
[out]	saveValueArray	Array of values after merge is complete
[in]	myStartIdxC	Global starting index of both partitions being considered
[in]	nextMaxB	Minimum value in the partition NEXT to the one we are comparing against
[in]	localAPartSize	Size of the partition we are considering
[in]	localBPartSize	Size of the partition we are comparing against
[in]	localMaxB	Largest element in THIS partition we are comparing against
[in]	localMinB	Smallest element in THIS partition we are comparing against
[in]	aIndex	The first global index our block is considering (thread 0 key 0)
[in]	bIndex	The first global index our block is comparing against (value 0 in shared memory)
[in]	offset	Count of key this thread is considering (between 1 and depth)

template<class T , int depth>

__device__ void linearMerge_higher	(	T *	searchArray,
		T	myKey,
		unsigned int	myVal,
		int &	index,
		T *	saveGlobalArray,
		unsigned int *	saveValueArray,
		int	myStartIdxC,
		T	localMinB,
		T	nextMaxB,
		int	aIndex,
		int	bIndex,
		int	offset,
		int	localAPartSize,
		int	localBPartSize
	)

inline

Performs a linear search in our shared memory (done after binary search). It merges the partition on the right side with the associated partition on the left side.

Parameters

[in]	searchArray	Array of keys
[in]	myKey	Current key being considered
[in]	myVal	Associated value of key
[in,out]	index	Index in local B partition we are comparing with
[out]	saveGlobalArray	Array of Keys after merge is complete
[out]	saveValueArray	Array of values after merge is complete
[in]	myStartIdxC	Global starting index of both partitions being considered
[in]	localMinB	Smallest element in THIS partition we are comparing against
[in]	nextMaxB	Minimum value in the partition NEXT to the one we are comparing against
[in]	aIndex	The first global index our block is considering (thread 0 key 0)
[in]	bIndex	The first global index our block is comparing against (value 0 in shared memory)
[in]	offset	Count of key this thread is considering (between 1 and depth)
[in]	localAPartSize	Size of the partition we are considering
[in]	localBPartSize	Size of the partition we are comparing against

template<bool doFlip>

__device__ uint floatFlip ( uint f )

Flips bits of single-precision floating-point number (parameterized by doFlip)

flip a float for sorting finds SIGN of fp number. if it's 1 (negative float), it flips all bits if it's 0 (positive float), it flips the sign only

Parameters

[in] f floating-point input (passed as unsigned int)

Returns: uint that stores the flipped version of the input

See also: floatUnflip

template<bool doFlip>

__device__ uint floatUnflip ( uint f )

Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)

flip a float back (invert FloatFlip) signed was flipped from above, so: if sign is 1 (negative), it flips the sign bit back if sign is 0 (positive), it flips all bits back

Parameters

[in] f floating-point input (passed as unsigned int)

Returns: uint that stores the unflipped version of the input

See also: floatFlip

template<class T , int maxlevel>

__device__ T scanwarp	(	T	val,
		volatile T *	sData
	)

Scans one warp quickly, optimized for 32-element warps, using shared memory.

Scans each warp in parallel ("warp-scan"), one element per thread. uses 2 numElements of shared memory per thread (64 numElements per warp)

Parameters

[in]	val	Elements per thread to scan
[in,out]	sData

Returns: Scanned input warp

__device__ uint4 scan4 ( uint4 idata )

Scans 4*CTA_SIZE unsigned ints in a block.

scan4 scans 4*CTA_SIZE numElements in a block (4 per thread), using a warp-scan algorithm

Parameters

[in] idata 4-vector of integers to scan

Returns: Scanned input 4-vector of integers

template<int ctasize>

__device__ uint4 rank4 ( uint4 preds )

Computes output position for each thread given predicate; trues come first then falses.

Rank is the core of the radix sort loop. Given a predicate, it computes the output position for each thread in an ordering where all True threads come first, followed by all False threads. This version handles 4 predicates per thread; hence, "rank4".

Parameters

[in] preds true/false values for each of the 4 elements in this thread

Returns: Output position for each thread

Todo:: is the description of "preds" correct?

template<uint nbits, uint startbit>

__device__ void radixSortBlock	(	uint4 &	key,
		uint4 &	value
	)

Sorts one block.

Uses rank to sort one bit at a time: Sorts a block according to bits startbit -> nbits + startbit

Parameters

[in,out]	key
[in,out]	value

Returns: Sorted key/value block with respect to startbit and nbits

template<uint nbits, uint startbit>

__device__ void radixSortBlockKeysOnly ( uint4 & key )

Sorts one block. Key-only version.

Uses rank to sort one bit at a time: Sorts a block according to bits startbit -> nbits + startbit

Parameters

[in,out] key

Returns: Sorted key-only block with respect to startbit and nbits

__device__ void swizzleShift ( uint4 * f )

Does a GLSL-style swizzle assigning f->xyzw = f->yzwx.

It does the equvalent of f->xyzw = f->yzwx since this functionality is in shading languages but not exposed in CUDA.

Parameters

[in] f the uint4 data type which will have its elements shifted. Passed in as pointer.

__device__ unsigned int leftRotate	(	unsigned int	x,
		unsigned int	n
	)

Rotates the bits in x over by n bits.

This is the equivalent of the ROTATELEFT operation as described in the MD5 working memo. It takes the bits in x and circular shifts it over by n bits.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[in]	x	the variable with the bits
[in]	n	the number of bits to shift left by.

Returns: Rotated input

__device__ unsigned int F	(	unsigned int	x,
		unsigned int	y,
		unsigned int	z
	)

The F scrambling function.

The F function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x & y) | ((~x) & z)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters

[in]	x	See the above formula
[in]	y	See the above formula
[in]	z	See the above formula

Returns: F(x, y, z)

See also: FF()

__device__ unsigned int G	(	unsigned int	x,
		unsigned int	y,
		unsigned int	z
	)

The G scrambling function.

The G function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x & z) | ((~z) & y)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters

[in]	x	See the above formula
[in]	y	See the above formula
[in]	z	See the above formula

Returns: G(x, y, z)

See also: GG()

__device__ unsigned int H	(	unsigned int	x,
		unsigned int	y,
		unsigned int	z
	)

The H scrambling function.

The H function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(x ^ y ^ z)

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters

[in]	x	See the above formula
[in]	y	See the above formula
[in]	z	See the above formula

Returns: H(x, y, z)

See also: HH()

__device__ unsigned int I	(	unsigned int	x,
		unsigned int	y,
		unsigned int	z
	)

The I scrambling function.

The I function in the MD5 technical memo scrambles three variables x, y, and z in the following way using bitwise logic:

(y ^ (x | ~z))

The resulting value is returned as an unsigned int.

For more information see: The MD5 Message-Digest Algorithm

Parameters

[in]	x	See the above formula
[in]	y	See the above formula
[in]	z	See the above formula

Returns: I(x, y, z)

See also: II()

__device__ void FF	(	uint4 *	td,
		int	i,
		uint4 *	Fr,
		float	p,
		unsigned int *	data
	)

The FF scrambling function.

The FF function in the MD5 technical memo is a wrapper for the F scrambling function as well as performing its own rotations using LeftRotate and swizzleShift. The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Fr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[in,out]	td	The current value of the digest stored as an uint4.
[in]	i	The current iteration of the algorithm. This affects the values in data.
[in]	Fr	The current rotation order.
[in]	p	The constant 2^32.
[in]	data	The starting input to MD5. Padded from setupInput().

Returns: FF(input)

See also: F(); swizzleShift(); leftRotate(); setupInput()

__device__ void GG	(	uint4 *	td,
		int	i,
		uint4 *	Gr,
		float	p,
		unsigned int *	data
	)

The GG scrambling function.

The GG function in the MD5 technical memo is a wrapper for the G scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Gr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[in,out]	td	The current value of the digest stored as an uint4.
[in]	i	The current iteration of the algorithm. This affects the values in data.
[in]	Gr	The current rotation order.
[in]	p	The constant 2^32.
[in]	data	The starting input to MD5. Padded from setupInput().

Returns: GG(input)

See also: G(); swizzleShift(); leftRotate(); setupInput()

__device__ void HH	(	uint4 *	td,
		int	i,
		uint4 *	Hr,
		float	p,
		unsigned int *	data
	)

The HH scrambling function.

The HH function in the MD5 technical memo is a wrapper for the H scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Hr, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[in,out]	td	The current value of the digest stored as an uint4.
[in]	i	The current iteration of the algorithm. This affects the values in data.
[in]	Hr	The current rotation order.
[in]	p	The constant 2^32.
[in]	data	The starting input to MD5. Padded from setupInput().

Returns: HH(input)

See also: H(); swizzleShift(); leftRotate(); setupInput()

__device__ void II	(	uint4 *	td,
		int	i,
		uint4 *	Ir,
		float	p,
		unsigned int *	data
	)

The II scrambling function.

The II function in the MD5 technical memo is a wrapper for the I scrambling function as well as performing its own rotations using LeftRotate() and swizzleShift(). The variable td is the current scrambled digest which is passed along and scrambled using the current iteration i, the rotation information Ir, and the starting input data. p is kept as a constant of 2^32. The resulting value is stored in td.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[in,out]	td	The current value of the digest stored as an uint4.
[in]	i	The current iteration of the algorithm. This affects the values in data.
[in]	Ir	The current rotation order.
[in]	p	The constant 2^32.
[in]	data	The starting input to MD5. Padded from setupInput().

Returns: II(input)

See also: I(); swizzleShift(); leftRotate(); setupInput()

__device__ void setupInput	(	unsigned int *	input,
		unsigned int	seed
	)

Sets up the input array using information of seed, and threadIdx.

This function sets up the input array using a combination of the current thread's id and the user supplied seed.

For more information see: The MD5 Message-Digest Algorithm.

Parameters

[out]	input	The array which will contain the initial values for all the scrambling functions.
[in]	seed	The user supplied seed as an unsigned int.

See also: FF(); GG(); HH(); II(); gen_randMD5()

template<class T , class traits >

__device__ void loadSharedChunkFromMem4	(	T *	s_out,
		T	threadScan[2][4],
		const T *	d_in,
		int	numElements,
		int	iDataOffset,
		int &	ai,
		int &	bi,
		int &	aiDev,
		int &	biDev
	)

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters

[out]	s_out	The output (shared) memory array
[out]	threadScan	Intermediate per-thread partial sums array (x2)
[in]	d_in	The input (device) memory array
[in]	numElements	The number of elements in the array being scanned
[in]	iDataOffset	the offset of the input array in global memory for this thread block
[out]	ai	The shared memory address for the thread's first element (returned for reuse)
[out]	bi	The shared memory address for the thread's second element (returned for reuse)
[out]	aiDev	The device memory address for this thread's first element (returned for reuse)
[out]	biDev	The device memory address for this thread's second element (returned for reuse)

template<class T , class traits >

__device__ void storeSharedChunkToMem4	(	T *	d_out,
		T	threadScan[2][4],
		T *	s_in,
		int	numElements,
		int	oDataOffset,
		int	ai,
		int	bi,
		int	aiDev,
		int	biDev
	)

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them to the intermediate sums computed in loadSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters

[out]	d_out	The output (device) memory array
[in]	threadScan	Intermediate per-thread partial sums array (x2) (contents computed in loadSharedChunkFromMem4())
[in]	s_in	The input (shared) memory array
[in]	numElements	The number of elements in the array being scanned
[in]	oDataOffset	the offset of the output array in global memory for this thread block
[in]	ai	The shared memory address for the thread's first element (computed in loadSharedChunkFromMem4())
[in]	bi	The shared memory address for the thread's second element (computed in loadSharedChunkFromMem4())
[in]	aiDev	The device memory address for this thread's first element (computed in loadSharedChunkFromMem4())
[in]	biDev	The device memory address for this thread's second element (computed in loadSharedChunkFromMem4())

template<class T , class traits >

__device__ void loadSharedChunkFromMem2	(	T *	s_out,
		T	threadScan[2][2],
		const T *	d_in,
		int	numElements,
		int	iDataOffset,
		int &	ai,
		int &	bi,
		int &	aiDev,
		int &	biDev
	)

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters

[out]	s_out	The output (shared) memory array
[out]	threadScan	Intermediate per-thread partial sums array (x2)
[in]	d_in	The input (device) memory array
[in]	numElements	The number of elements in the array being scanned
[in]	iDataOffset	the offset of the input array in global memory for this thread block
[out]	ai	The shared memory address for the thread's first element (returned for reuse)
[out]	bi	The shared memory address for the thread's second element (returned for reuse)
[out]	aiDev	The device memory address for this thread's first element (returned for reuse)
[out]	biDev	The device memory address for this thread's second element (returned for reuse)

template<class T , class traits >

__device__ void storeSharedChunkToMem2	(	T *	d_out,
		T	threadScan[2][2],
		T *	s_in,
		int	numElements,
		int	oDataOffset,
		int	ai,
		int	bi,
		int	aiDev,
		int	biDev
	)

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them to the intermediate sums computed in loadSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters

[out]	d_out	The output (device) memory array
[in]	threadScan	Intermediate per-thread partial sums array (x2) (contents computed in loadSharedChunkFromMem4())
[in]	s_in	The input (shared) memory array
[in]	numElements	The number of elements in the array being scanned
[in]	oDataOffset	the offset of the output array in global memory for this thread block
[in]	ai	The shared memory address for the thread's first element (computed in loadSharedChunkFromMem4())
[in]	bi	The shared memory address for the thread's second element (computed in loadSharedChunkFromMem4())
[in]	aiDev	The device memory address for this thread's first element (computed in loadSharedChunkFromMem4())
[in]	biDev	The device memory address for this thread's second element (computed in loadSharedChunkFromMem4())

template<class T , class traits , int maxlevel>

__device__ T warpscan	(	T	val,
		volatile T *	s_data
	)

Scan all warps of a CTA without synchronization.

The warp-scan algorithm breaks a block of data into warp-sized chunks, and scans the chunks independently with a warp of threads each. Because warps execute instructions in SIMD fashion, there is no need to synchronize in order to share data within a warp (only across warps). Also, in SIMD the most efficient algorithm is a step-efficient algorithm. Therefore, within each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the work-efficient tree-based algorithm described by Guy Blelloch [1990] that takes 2 * log(N) steps and is in general more complex to implement. Previous versions of CUDPP used the Blelloch algorithm. For current GPUs, the warp size is 32, so this takes five steps per warp.

Each thread is responsible for a single element of the array to be scanned. Each thread inputs a single value to the scan via val and returns its own scanned result element. The threads of each warp cooperate via the shared memory array s_data to scan WARP_SIZE elements.

Template parameter maxlevel allows this warpscan to be performed on partial warps. For example, if only the first 8 elements of each warp need to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.

The computation uses 2 * WARP_SIZE elements of shared memory per warp to enable warps to offset beyond their input data and receive the identity element without using any branch instructions.

Note: s_data is declared volatile here to prevent the compiler from optimizing away writes to shared memory, and ensure correct intrawarp communication in the absence of __syncthreads.

Returns: The result of the warp scan for the current thread

Parameters

[in]	val	The current threads's input to the scan
[in,out]	s_data	A pointer to a temporary shared array of 2*CTA_SIZE elements used to compute the warp scans

template<class T , class traits >

__device__ void scanWarps	(	T	x,
		T	y,
		T *	s_data
	)

Perform a full CTA scan using the warp-scan algorithm.

As described in the comment for warpscan(), the warp-scan algorithm breaks a block of data into warp-sized chunks, and scans the chunks independently with a warp of threads each. To complete the scan, each warp j then writes its last element to element j of a temporary shared array. Then a single warp exclusive-scans these "warp sums". Finally, each thread adds the result of the warp sum scan to the result of the scan from the first pass.

Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan twice.

Parameters

x	The first input value for the current thread
y	The second input value for the current thread
s_data	Temporary shared memory space of 2*CTA_SIZE elements for performing the scan

template<class T , class traits >

__device__ void scanCTA	(	T *	s_data,
		T *	d_blockSums,
		unsigned int	blockSumIndex
	)

CTA-level scan routine; scans s_data in shared memory in each thread block.

This function is the main CTA-level scan function. It may be called by other CUDA global or device functions. This function scans 2 * CTA_SIZE elements. Each thread is responsible for one element in each half of the input array.

Note: This code is intended to be run on a CTA of 128 threads. Other sizes are untested.

Parameters

[in]	s_data	The array to be scanned in shared memory
[out]	d_blockSums	Array of per-block sums
[in]	blockSumIndex	Location in d_blockSums to which to write this block's sum

template<class T , typename traits >

__device__ void loadForSegmentedScanSharedChunkFromMem4	(	T *	s_odata,
		T	threadScan0[4],
		T	threadScan1[4],
		unsigned int &	threadFlag,
		unsigned int *	s_oflags,
		unsigned int *	s_oindices,
		const T *	d_idata,
		const unsigned int *	d_iflags,
		int	numElements,
		int	iDataOffset,
		int &	ai,
		int &	bi,
		int &	aiDev,
		int &	biDev
	)

inline

Handles loading input s_data from global memory to shared memory (vec4 version)

Load a chunk of 8*blockDim.x elements from global memory into a shared memory array. Each thread loads two T4 elements (where T4 is, e.g. int4 or float4), computes the segmented scan of those two vec4s in thread local arrays (in registers), and writes the two total sums of the vec4s into shared memory, where they will be cooperatively scanned with the other partial sums by all threads in the CTA.

Parameters

[out]	s_odata	The output (shared) memory array
[out]	threadScan0	Intermediate per-thread partial sums array 1
[out]	threadScan1	Intermediate per-thread partial sums array 2
[out]	threadFlag	Intermediate array which holds 8 flags as follows Temporary register threadFlag0[4] - the flags for the first 4 elements read Temporary register threadFlag1[4] - the flags for the second 4 elements read Temporary register threadScanFlag0[4] - the inclusive OR-scan for the flags in threadFlag0[4] Temporary register threadScanFlag1[4] - the inclusive OR-scan for the flags in threadFlag1[4] We storing the 16 flags 32 bits of threadFlag Bits 0...3 contains threadFlag0[0]...threadFlag0[3] Bits 4...7 contains threadFlag1[0]...threadFlag1[3] Bits 8...11 contains threadScanFlag0[0]...threadScanFlag0[3] Bits 11...15 contains threadScanFlag1[0]...threadScanFlag1[3]
[out]	s_oflags	Output (shared) memory array of segment head flags
[out]	s_oindices	Output (shared) memory array of indices. If a flag for a position (1-based) is set then index for that position is the position, 0 otherwise.
[in]	d_idata	The input (device) memory array
[in]	d_iflags	The input (device) memory array of segment head flags
[in]	numElements	The number of elements in the array being scanned
[in]	iDataOffset	the offset of the input array in global memory for this thread block
[out]	ai	The shared memory address for the thread's first element (returned for reuse)
[out]	bi	The shared memory address for the thread's second element (returned for reuse)
[out]	aiDev	The device memory address for this thread's first element (returned for reuse)
[out]	biDev	The device memory address for this thread's second element (returned for reuse)

template<class T , class traits >

__device__ void storeForSegmentedScanSharedChunkToMem4	(	T *	d_odata,
		T	threadScan0[4],
		T	threadScan1[4],
		unsigned int	threadFlag,
		T *	s_idata,
		unsigned int	numElements,
		int	oDataOffset,
		int	ai,
		int	bi,
		int	aiDev,
		int	biDev
	)

inline

Handles storing result s_data from shared memory to global memory (vec4 version)

Store a chunk of 8*blockDim.x elements from shared memory into a device memory array. Each thread stores reads two elements from shared memory, adds them while respecting segment bouldaries, to the intermediate sums computed in loadForSegmentedScanSharedChunkFromMem4(), and writes two T4 elements (where T4 is, e.g. int4 or float4) to global memory.

Parameters

[out]	d_odata	The output (device) memory array
[out]	threadScan0	Intermediate per-thread partial sums array 1 (contents computed in loadForSegmentedScanSharedChunkFromMem4())
[in]	threadScan1	Intermediate per-thread partial sums array 2 (contents computed in loadForSegmentedScanSharedChunkFromMem4())
[in]	threadFlag	Various flags that loadForSegmentedScanSharedChunkFromMem4() needs to pass
[in]	s_idata	The input (shared) memory array
[in]	numElements	The number of elements in the array being scanned
[in]	oDataOffset	the offset of the output array in global memory for this thread block
[in]	ai	The shared memory address for the thread's first element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]	bi	The shared memory address for the thread's second element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]	aiDev	The device memory address for this thread's first element (computed in loadForSegmentedScanSharedChunkFromMem4())
[in]	biDev	The device memory address for this thread's second element (computed in loadForSegmentedScanSharedChunkFromMem4())

template<class T , class traits >

__device__ void segmentedScanCTA	(	T *	s_data,
		unsigned int *	s_flags,
		unsigned int *	s_indices,
		T *	d_blockSums = `0`,
		unsigned int *	d_blockFlags = `0`,
		unsigned int *	d_blockIndices = `0`
	)

CTA-level segmented scan routine;.

Performs segmented scan on s_data in shared memory in each thread block with head flags in s_flags (s_tflags is a read-write copy of the head flags which are modified).

This function is the main CTA-level segmented scan function. It may be called by other CUDA global or device functions.

Note: This code is intended to be run on a CTA of 128 threads. Other sizes are untested.

Parameters

[in]	s_data	Array to be scanned in shared memory
[in]	s_flags	Read-only version of flags in shared memory
[in]	s_indices	Temporary read-write indices array
[out]	d_blockSums	Array of per-block sums
[out]	d_blockFlags	Array of per-block OR-reduction of flags
[out]	d_blockIndices	Array of per-block min-reduction of indices

__device__ int tie_break_simp	(	unsigned int	myLoc,
		unsigned int	cmpLoc,
		unsigned int	myBound,
		unsigned int	cmpBound,
		unsigned int	myAdd,
		unsigned int	cmpAdd,
		unsigned int *	stringLoc,
		unsigned int	stringSize,
		unsigned char	termC
	)

Breaks ties in keys (first four characters) returns true if cmpVal > myVal false otherwise.

Parameters

[in]	myLoc,cmpLoc	Location of the two inputs
[in]	myBound,cmpBound	Local memory bounds for the two addresses
[in]	myAdd	Address into global memory of our current value
[in]	cmpAdd	Address into global memory of the value we are comparing against
[in]	stringLoc	Global memory array (input string)
[in]	stringSize	Size of our input string
[in]	termC	Termination character for our strings

Returns: Returns 1 if cmpVal > myVal 0 otherwise

template<class T , int depth>

__device__ void bin_search_block_string	(	T &	cmpValue,
		T	tmpVal,
		T *	in,
		T *	addressPad,
		T *	stringVals,
		int &	j,
		int	bump,
		int	sizeRemain,
		unsigned int	stringSize,
		unsigned char	termC
	)

Binary search within a single block (blockSort)

Parameters

[in,out]	cmpValue	Value being considered from other partition
[in]	tmpVal	My Value
[in]	in	input keys
[in]	addressPad	addresses of string locations in case of tie breaks
[in]	stringVals	global string array used to break ties
[in,out]	j	The index we are considering
[in]	bump	The offset we update by
[in]	sizeRemain	Size of our block (if it's smaller than blockSize)
[in]	stringSize	Size of our global string array (for tie breaks)
[in]	termC	Termination character for our strings

template<class T , int depth>

__device__ void lin_search_block_string	(	T &	cmpValue,
		T &	tmpVal,
		T *	in,
		T *	addressPad,
		T *	stringVals,
		int &	j,
		int	offset,
		int	last,
		int	startAddress,
		int	stringSize,
		unsigned char	termC
	)

Linear search within a single block (blockSort)

Parameters

[in,out]	cmpValue	Value being considered from other partition
[in,out]	tmpVal	Temporary register which is used initially to compare our value, and then to store the final address after our search
[in]	in,addressPad,stringVals	in = keys, addressPad = values, stringVals = global string array for tie breaks
[in]	j	index in B partition we are considering
[in]	offset	Since this is register packed, offset is the ith iteration of linear search
[in]	last	The end of partition B we are allowed to look upto
[in]	startAddress	The beginning of our partition
[in]	stringSize	Size of our global string array
[in]	termC	Termination character for our strings

template<class T >

__device__ void compareSwapVal	(	T &	A1,
		T &	A2,
		const int	index1,
		const int	index2,
		T *	scratch,
		T *	stringVals,
		unsigned int	size,
		unsigned char	termC
	)

For blockSort. Compares two values and decides to swap if A1 > A2.

Parameters

[in,out]	A1	First value being compared
[in,out]	A2	Second value being compared
[in]	index1	Local address of A1
[in]	index2	Local address of A2
[in,out]	scratch	Scratch memory storing the addresses
[in]	stringVals	String Values for tie breaks
[in]	size	size of our array
[in]	termC	Termination character for our strings

template<class T , int depth>

__device__ void binSearch_fragment	(	T *	keys,
		T *	address,
		int	offset,
		int &	mid,
		T	cmpValue,
		T	testValue,
		T	myAddress,
		int	myLoc,
		int	cmpLoc,
		int	myBound,
		int	cmpBound,
		T *	globalStringArray,
		int	stringSize,
		unsigned char	termC
	)

Performs a binary search in our shared memory, with tie breaks for strings.

Parameters

[in]	keys,address	Keys and address from our array
[in]	offset,mid	The current "middle" we are searching and the offset we will move to next
[in]	cmpValue,testValue	testValue is the value we are searching for from array A, cmpValue the value we have currently in B
[in]	myAddress,myLoc,cmpLoc,myBound,cmpBound	Same values from tie_break_simp which will be passed along
[in]	globalStringArray,stringSize	Our string array for breaking ties, and stringSize so we don't go out of bounds
[in]	termC	Termination character for our strings

template<class T , int depth>

__device__ void lin_merge_simple	(	T &	cmpValue,
		T	myKey,
		T	myAddress,
		int &	index,
		T *	BKeys,
		T *	BValues,
		T *	stringValues,
		T *	A_keys,
		T *	A_values,
		T *	A_keys_out,
		T *	A_values_out,
		int	myStartIdxA,
		int	myStartIdxB,
		int	myStartIdxC,
		T	localMinB,
		T	localMaxB,
		int	aCont,
		int	bCont,
		int	totalSize,
		int	mySizeA,
		int	mySizeB,
		unsigned int	stringSize,
		int	i,
		int	stepNum,
		bool &	placed,
		unsigned char	termC
	)

Performs a linear search in our shared memory (done after binary search), with tie breaks for strings.

Parameters

[in,out]	cmpValue	The current value we are looking at in our B array
[in]	myKey,myAddress	Keys and address from our array
[in]	index	Current index we are considering in our B array
[in]	BKeys,BValues	Keys and Addresses for array B
[in,out]	stringValues,A_keys,A_values,A_keys_out,A_values_out	Global arrays for our strings, keys, values
[in]	myStartIdxA,myStartIdxB,myStartIdxC	Beginning indices for our partitions
[in]	localMinB,localMaxB	The minimum and maximum values in our B partition
[in]	aCont,bCont,totalSize,mySizeA,mySizeB,stringSize	Address bounds and calculation helpers
[in]	i	The index of the local element we are merging
[in]	stepNum	Debug helper
[in]	placed	Whether value has been placed yet or not
[in]	termC	Termination character for our strings

template<class T , int depth>

__device__ void linearStringMerge	(	T *	BKeys,
		T *	BValues,
		T	myKey,
		T	myAddress,
		bool &	placed,
		int &	index,
		T &	cmpValue,
		T *	A_keys,
		T *	A_values,
		T *	A_keys_out,
		T *	A_values_out,
		T *	stringValues,
		int	myStartIdxC,
		int	myStartIdxA,
		int	myStartIdxB,
		int	localAPartSize,
		int	localBPartSize,
		int	localCPartSize,
		T	localMaxB,
		T	localMinB,
		int	tid,
		int	aIndex,
		int	bIndex,
		int	i,
		int	stringSize,
		int	totalSize,
		unsigned char	termC
	)

Performs a linear search in our shared memory, used by multiMerge kernel.

Parameters

[in]	BKeys,BValues	Keys and Addresses for array B
[in]	myKey,myAddress	Keys and address from our array
[in]	placed	Whether value has been placed yet or not
[in]	index	Current index we are considering in our B array
[in,out]	cmpValue	The current value we are looking at in our B array
[in,out]	stringValues,A_keys,A_values,A_keys_out,A_values_out	Global arrays for our strings, keys, values
[in]	myStartIdxA,myStartIdxB,myStartIdxC	Beginning indices for our partitions
[in]	localAPartSize,localBPartSize,localCPartSize	Array of partition sizes for our inputs and outputs
[in]	localMinB,localMaxB	The minimum and maximum values in our B partition
[in]	tid	thread ID
[in]	aIndex,bIndex,totalSize,stringSize	Address bounds and calculation helpers
[in]	i	The index of the local element we are merging
[in]	termC	Termination character for our strings

Classes

Compress Functions

Merge Sort Functions

Radix Sort Functions

Rand Functions

Scan Functions

Segmented scan Functions

Merge Sort Functions

Detailed Description

Macro Definition Documentation

Function Documentation