Skip to content

Commit

Permalink
- Added new sorting interface that does not disturb input array
Browse files Browse the repository at this point in the history
- Update to ArgMin/Max functors for more efficient codegen
- Update to histogram interface to provide NPP/IPP-like functionality
- Remove all traces of 8B smem mode setting (which could cause
serializations in streams)

Former-commit-id: df811c0
  • Loading branch information
dumerrill committed Aug 11, 2014
1 parent 109e0f7 commit 7aa5005
Show file tree
Hide file tree
Showing 73 changed files with 2,661 additions and 2,399 deletions.
8 changes: 4 additions & 4 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
1.2.3 04/01/2014
- Bug fixes:
- Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
- Fixed code-snippet bug in ArgIndexInputIterator documentation
- Fixed code-snippet bug in ArgIndexInputIteratorT documentation

//-----------------------------------------------------------------------------

Expand Down Expand Up @@ -87,13 +87,13 @@
- Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)
- Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
- Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
- Added workaround for TexRefInputIterator to work with CUDA 6.0
- Added workaround for TexRefInputIteratorT to work with CUDA 6.0

//-----------------------------------------------------------------------------

1.1.1 12/11/2013
- New features:
- Added TexObjInputIterator, TexRefInputIterator, CacheModifiedInputIterator, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
Expand All @@ -103,7 +103,7 @@
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIterator and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ __global__ void BlockSortKernel(int *d_in, int *d_out)
typename BlockStore::TempStorage store;
} temp_storage;

int block_offset = blockIdx.x * (128 * 16); // Offset for this block's segment
int block_offset = blockIdx.x * (128 * 16); // OffsetT for this block's segment

// Obtain a segment of 2048 consecutive keys that are blocked across threads
int thread_keys[16];
Expand Down
32 changes: 16 additions & 16 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ private:
struct ApplyOp
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b, idx);
}
Expand All @@ -157,7 +157,7 @@ private:
struct ApplyOp<FlagOp, false>
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b);
}
Expand All @@ -178,7 +178,7 @@ private:
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
flags[ITERATION] = ApplyOp<FlagOp>::Flag(
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION - 1],
input[ITERATION],
Expand All @@ -198,7 +198,7 @@ private:
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
flags[ITERATION] = ApplyOp<FlagOp>::Flag(
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION],
input[ITERATION + 1],
Expand Down Expand Up @@ -355,7 +355,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand Down Expand Up @@ -441,7 +441,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand Down Expand Up @@ -526,7 +526,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -613,7 +613,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down Expand Up @@ -710,7 +710,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand All @@ -719,7 +719,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -818,7 +818,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand All @@ -829,7 +829,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down Expand Up @@ -936,7 +936,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand All @@ -945,7 +945,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -1054,7 +1054,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand All @@ -1065,7 +1065,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down
48 changes: 24 additions & 24 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for no timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
Expand All @@ -539,10 +539,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for warp-timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
Expand Down Expand Up @@ -591,10 +591,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for no timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
Expand All @@ -620,10 +620,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
Expand Down Expand Up @@ -911,12 +911,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
Expand All @@ -928,12 +928,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
Expand All @@ -945,12 +945,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStripedGuarded(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
Expand Down Expand Up @@ -978,13 +978,13 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam ValidFlag <b>[inferred]</b> Flag type denoting which items are valid
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
* \tparam ValidFlag <b>[inferred]</b> Flag type denoting which items are valid
*/
template <typename Offset, typename ValidFlag>
template <typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
#pragma unroll
Expand Down Expand Up @@ -1091,12 +1091,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
Expand Down
18 changes: 9 additions & 9 deletions cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -285,10 +285,10 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <typename HistoCounter>
__device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
template <typename HistoCounterT>
__device__ __forceinline__ void InitHistogram(HistoCounterT histogram[BINS])
{
// Initialize histogram bin counts to zeros
int histo_offset = 0;
Expand Down Expand Up @@ -340,13 +340,13 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
typename HistoCounterT>
__device__ __forceinline__ void Histogram(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
HistoCounterT histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
// Initialize histogram bin counts to zeros
InitHistogram(histogram);
Expand Down Expand Up @@ -397,13 +397,13 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
typename HistoCounterT>
__device__ __forceinline__ void Composite(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
HistoCounterT histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
InternalBlockHistogram(temp_storage).Composite(items, histogram);
}
Expand Down
Loading

0 comments on commit 7aa5005

Please sign in to comment.