Merge pull request NVIDIA#1 from NVlabs/master
Master
dumerrill authored Oct 24, 2017
2 parents b165e1f + d622848 commit e6b5bf5
Showing 45 changed files with 714 additions and 567 deletions.
84 changes: 48 additions & 36 deletions .cproject

Large diffs are not rendered by default.

22 changes: 21 additions & 1 deletion CHANGE_LOG.TXT
@@ -1,3 +1,23 @@
1.7.4 09/20/2017
- Bug fixes:
- Issue #114: Can't pair non-trivially-constructible values in radix sort
- Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32
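
For context on issue #115: the affected pattern is cub::WarpReduce instantiated with a logical warp size smaller than 32. Below is a minimal sketch of a head-segmented sum over 16-thread logical warps; the kernel shape, block size, and buffer names are illustrative assumptions, not taken from this commit.

```cpp
#include <cub/warp/warp_reduce.cuh>

// Head-segmented sum across logical warps of 16 threads (the configuration
// affected by issue #115).  A block of 64 threads is assumed for this sketch.
__global__ void SegmentedWarpSumKernel(const int *d_in, const int *d_flags, int *d_out)
{
    const int LOGICAL_WARP_THREADS = 16;
    typedef cub::WarpReduce<int, LOGICAL_WARP_THREADS> WarpReduceT;

    // One TempStorage instance per logical warp in the block (64 / 16 = 4)
    __shared__ typename WarpReduceT::TempStorage temp_storage[4];

    int tid       = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id   = threadIdx.x / LOGICAL_WARP_THREADS;
    int item      = d_in[tid];
    int head_flag = d_flags[tid];      // 1 marks the first item of a segment

    // Each segment head receives the sum of its segment
    int segment_sum = WarpReduceT(temp_storage[warp_id]).HeadSegmentedSum(item, head_flag);

    if (head_flag)
        d_out[tid] = segment_sum;
}
```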

//-----------------------------------------------------------------------------

1.7.3 08/28/2017
- Bug fixes:
- Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
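
For context on issue #110: the failing path is a DeviceHistogram call whose samples come from a fancy (non-pointer) iterator. A hedged sketch of that call pattern using cub::TransformInputIterator and DeviceHistogram::HistogramEven follows; the conversion functor, bin count, and buffer names are assumptions made for illustration.

```cpp
#include <cub/device/device_histogram.cuh>
#include <cub/iterator/transform_input_iterator.cuh>

// Functor converting raw bytes to float samples
struct ByteToFloat
{
    __host__ __device__ float operator()(unsigned char b) const { return (float) b; }
};

void HistogramFromIterator(unsigned char *d_bytes, int *d_histogram, int num_samples)
{
    // Wrap the raw input in a fancy iterator: the non-pointer input path
    // exercised by issue #110.
    cub::TransformInputIterator<float, ByteToFloat, unsigned char*>
        d_samples(d_bytes, ByteToFloat());

    // First call: query temporary storage size
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 257, 0.0f, 256.0f, num_samples);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: compute 256 evenly spaced bins over [0, 256)
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 257, 0.0f, 256.0f, num_samples);

    cudaFree(d_temp_storage);
}
```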

//-----------------------------------------------------------------------------

1.7.2 08/26/2017
- Bug fixes:
- Issue #104: Device-wide reduction is now "run-to-run" deterministic for
pseudo-associative reduction operators (like floating point addition)
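
This guarantee concerns the usual two-phase DeviceReduce invocation sketched below (buffer names are illustrative): with a static work assignment, repeating the second call on the same device and input yields bit-identical floating-point results.

```cpp
#include <cub/device/device_reduce.cuh>

void SumFloats(const float *d_in, float *d_out, int num_items)
{
    // First call: query temporary storage size (no work is done)
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: run the reduction.  Because tiles are assigned to blocks
    // statically, the order of pseudo-associative additions is the same
    // from run to run.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```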

//-----------------------------------------------------------------------------

1.7.1 08/18/2017
- Updated Volta radix sorting tuning policies
- Bug fixes:
@@ -171,7 +191,7 @@
1.3.2 07/28/2014
- Bug fixes:
- Fix for cub::DeviceReduce where reductions of small problems
(small enough to only dispatch a single threadblock) would run in
(small enough to only dispatch a single thread block) would run in
the default stream (stream zero) regardless of whether an alternate
stream was specified.

2 changes: 1 addition & 1 deletion README.md
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.7.1 (08/18/2017)
Current release: v1.7.4 (09/20/2017)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

2 changes: 1 addition & 1 deletion cub/agent/agent_histogram.cuh
@@ -746,7 +746,7 @@ struct AgentHistogram
((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

// Whether rows are aligned and can be vectorized
if (quad_aligned_rows || pixel_aligned_rows)
if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
else
ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
10 changes: 7 additions & 3 deletions cub/agent/agent_radix_sort_downsweep.cuh
@@ -171,6 +171,8 @@ struct AgentRadixSortDownsweep
ITEMS_PER_THREAD,
LOAD_ALGORITHM> BlockLoadValuesT;

// Value exchange array type
typedef ValueT ValueExchangeT[TILE_ITEMS];

/**
* Shared memory storage layout
@@ -187,7 +189,7 @@
OffsetT relative_bin_offsets[RADIX_DIGITS];
};

ValueT exchange_values[TILE_ITEMS];
Uninitialized<ValueExchangeT> exchange_values;

OffsetT exclusive_digit_prefix[RADIX_DIGITS];
};
@@ -276,18 +278,20 @@
{
CTA_SYNC();

ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();

#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
temp_storage.exchange_values[ranks[ITEM]] = values[ITEM];
exchange_values[ranks[ITEM]] = values[ITEM];
}

CTA_SYNC();

#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
ValueT value = temp_storage.exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];

if (FULL_TILE ||
(static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
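
The change above stores the value-exchange array as cub::Uninitialized raw storage and reinterprets it through Alias() on use, so ValueT's constructor never runs on __shared__ memory; this is what permits non-trivially-constructible value types (issue #114) to be paired in a sort. A condensed, hypothetical sketch of that pattern, with names simplified from the diff:

```cpp
#include <cub/util_type.cuh>

// Shared-memory layout: raw (uninitialized) storage for the value exchange,
// so ValueT's default constructor never runs on __shared__ memory.
template <typename ValueT, int TILE_ITEMS>
struct ValueExchangeStorage
{
    typedef ValueT ValueExchangeT[TILE_ITEMS];
    cub::Uninitialized<ValueExchangeT> exchange_values;
};

// Scatter values by rank, then gather them back in blocked order.
template <typename ValueT, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void ExchangeValues(
    ValueExchangeStorage<ValueT, BLOCK_THREADS * ITEMS_PER_THREAD> &storage,
    ValueT (&values)[ITEMS_PER_THREAD],
    int    (&ranks)[ITEMS_PER_THREAD])
{
    // Reinterpret the raw bytes as a typed array only when accessing them
    ValueT (&exchange)[BLOCK_THREADS * ITEMS_PER_THREAD] = storage.exchange_values.Alias();

    __syncthreads();

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        exchange[ranks[ITEM]] = values[ITEM];

    __syncthreads();

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        values[ITEM] = exchange[threadIdx.x + (ITEM * BLOCK_THREADS)];
}
```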
136 changes: 23 additions & 113 deletions cub/agent/agent_reduce.cuh
@@ -38,7 +38,6 @@
#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_queue.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
@@ -64,8 +63,7 @@ template <
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements
struct AgentReducePolicy
{
enum
@@ -77,7 +75,6 @@ struct AgentReducePolicy

static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
};


@@ -148,7 +145,6 @@ struct AgentReduce
struct _TempStorage
{
typename BlockReduceT::TempStorage reduce;
OffsetT dequeue_offset;
};

/// Alias wrapper allowing storage to be unioned
@@ -314,36 +310,35 @@
*/
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeRange(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end, ///< [in] Threadblock end offset (exclusive)
GridEvenShare<OffsetT> &even_share, ///< GridEvenShare descriptor
Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
{
OutputT thread_aggregate;

if (block_offset + TILE_ITEMS > block_end)
if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
{
// First tile isn't full (not all threads have valid items)
int valid_items = block_end - block_offset;
ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
}

// At least one full block
ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
block_offset += TILE_ITEMS;
ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;

// Consume subsequent full tiles of input
while (block_offset + TILE_ITEMS <= block_end)
while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
{
ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
block_offset += TILE_ITEMS;
ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;
}

// Consume a partially-full tile
if (block_offset < block_end)
if (even_share.block_offset < even_share.block_end)
{
int valid_items = block_end - block_offset;
ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
}

// Compute block-wide reduction (all threads have valid items)
@@ -358,113 +353,28 @@ struct AgentReduce
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
GridEvenShare<OffsetT> even_share;
even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);

return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
}


/**
* Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT /*num_items*/, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &/*queue*/, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_EVEN_SHARE> /*is_even_share*/) ///< [in] Marker type indicating this is an even-share mapping
GridEvenShare<OffsetT> &even_share) ///< [in] GridEvenShare descriptor
{
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
// Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());

}


//---------------------------------------------------------------------
// Dynamically consume tiles
//---------------------------------------------------------------------

/**
* Dequeue and reduce tiles of items as part of a inter-block reduction
*/
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeTiles(
int num_items, ///< Total number of input items
GridQueue<OffsetT> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
{
// We give each thread block at least one tile of input.
OutputT thread_aggregate;
OffsetT block_offset = blockIdx.x * TILE_ITEMS;
OffsetT even_share_base = gridDim.x * TILE_ITEMS;

if (block_offset + TILE_ITEMS > num_items)
{
// First tile isn't full (not all threads have valid items)
int valid_items = num_items - block_offset;
ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
}

// Consume first full tile of input
ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());

if (num_items > even_share_base)
{
// Dequeue a tile of items
if (threadIdx.x == 0)
temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

CTA_SYNC();

// Grab tile offset and check if we're done with full tiles
block_offset = temp_storage.dequeue_offset;

// Consume more full tiles
while (block_offset + TILE_ITEMS <= num_items)
{
ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);

CTA_SYNC();

// Dequeue a tile of items
if (threadIdx.x == 0)
temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

CTA_SYNC();

// Grab tile offset and check if we're done with full tiles
block_offset = temp_storage.dequeue_offset;
}

// Consume partial tile
if (block_offset < num_items)
{
int valid_items = num_items - block_offset;
ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
}
}

// Compute block-wide reduction (all threads have valid items)
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);

}

/**
* Dequeue and reduce tiles of items as part of a inter-block reduction
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT num_items, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &/*even_share*/, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &queue, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_DYNAMIC> /*is_dynamic*/) ///< [in] Marker type indicating this is a dynamic mapping
{
return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
}

};
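
With the GridQueue path removed, AgentReduce always walks a statically precomputed even-share of tiles: each block starts at its own offset and advances by a fixed block_stride until it reaches block_end. The loop below is a loose, item-granular analogue in plain CUDA (the real code consumes whole tiles described by cub::GridEvenShare); it illustrates why the order of additions, and therefore the rounded result, is identical from run to run.

```cpp
// Per-thread partial sum with a statically strided (strip-mined) assignment
// of work to blocks.  All names and the item-granular loop are illustrative.
__device__ float ThreadPartialSum(const float *d_in, int num_items)
{
    float thread_aggregate = 0.0f;

    // Each thread revisits offsets offset, offset + stride, ...  The
    // schedule never depends on execution order or on a shared queue, so
    // the sequence of additions is the same on every run.
    for (int offset = blockIdx.x * blockDim.x + threadIdx.x;
         offset < num_items;
         offset += gridDim.x * blockDim.x)
    {
        thread_aggregate += d_in[offset];
    }

    return thread_aggregate;   // subsequently combined with a block-wide reduction
}
```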
2 changes: 1 addition & 1 deletion cub/agent/agent_reduce_by_key.cuh
@@ -223,7 +223,7 @@ struct AgentReduceByKey
typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1];
typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1];

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
@@ -208,7 +208,7 @@ struct AgentRle

typedef LengthOffsetPair WarpAggregates[WARPS];

// Shared memory type for this threadblock
// Shared memory type for this thread block
struct _TempStorage
{
// Aliasable storage layout
2 changes: 1 addition & 1 deletion cub/agent/agent_scan.cuh
@@ -163,7 +163,7 @@ struct AgentScan
ScanOpT>
RunningPrefixCallbackOp;

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
typename BlockLoadT::TempStorage load; // Smem needed for tile loading
2 changes: 1 addition & 1 deletion cub/agent/agent_segment_fixup.cuh
@@ -162,7 +162,7 @@ struct AgentSegmentFixup
ScanTileStateT>
TilePrefixCallbackOpT;

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
2 changes: 1 addition & 1 deletion cub/agent/agent_select_if.cuh
@@ -192,7 +192,7 @@ struct AgentSelectIf
// Item exchange type
typedef OutputT ItemExchangeT[TILE_ITEMS];

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
6 changes: 3 additions & 3 deletions cub/block/block_radix_rank.cuh
@@ -28,7 +28,7 @@

/**
* \file
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
*/

#pragma once
@@ -51,7 +51,7 @@ CUB_NS_PREFIX
namespace cub {

/**
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
* \ingroup BlockModule
*
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
@@ -398,7 +398,7 @@ public:
// Extract the local ranks of each key
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Add in threadblock exclusive prefix
// Add in thread block exclusive prefix
ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
}
}
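
BlockRadixRank is the digit-ranking primitive beneath cub::BlockRadixSort. For orientation, here is a minimal sketch of the public block-level sort built on it; the block and tile dimensions are assumptions chosen for illustration.

```cpp
#include <cub/block/block_radix_sort.cuh>

// Sort 512 integer keys per block (128 threads x 4 items each).
__global__ void BlockSortKernel(int *d_keys)
{
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSortT;
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    // Each thread owns 4 consecutive ("blocked") keys
    int thread_keys[4];
    int block_offset = blockIdx.x * (128 * 4);
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[block_offset + threadIdx.x * 4 + i];

    // Cooperative block-wide radix sort (ascending)
    BlockRadixSortT(temp_storage).Sort(thread_keys);

    for (int i = 0; i < 4; ++i)
        d_keys[block_offset + threadIdx.x * 4 + i] = thread_keys[i];
}
```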