Skip to content

Commit

Permalink
Better fix for Issue NVIDIA#71 (KeyValuePair won't work if Key has non…
Browse files Browse the repository at this point in the history
…-trivial

ctor)

https://github.com/NVlabs/cub/issues/71

Doc fixes for Issue NVIDIA#70 (1.5.5 breaks BlockScan API)

https://github.com/NVlabs/cub/issues/70
Former-commit-id: eb1c1432ad3df4d29dc055d45f4ccc3b00c8b2ba
  • Loading branch information
dumerrill committed Nov 21, 2016
1 parent 7b34462 commit af70707
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 29 deletions.
23 changes: 18 additions & 5 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
1.5.5 10/25/2016
1.6.3 11/20/2016
- Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
- Bug fixes:
- Issue #74: WarpReduce executes reduction operator for out-of-bounds items
- Issue #72 (cub::InequalityWrapper::operator() should be non-const)
- Issue #71 (KeyValuePair won't work if Key has non-trivial ctor)
- Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx
arch specialization)
- Issue #70 1.5.3 breaks BlockScan API. Retroactively reversioned
from v1.5.3 -> v1.6 to appropriately indicate API change.

//-----------------------------------------------------------------------------

1.6.2 (was 1.5.5) 10/25/2016
- Updated Pascal tuning policies for radix sorting
- Bug fixes:
- Fix for arm64 compilation of caching allocator

//-----------------------------------------------------------------------------

1.5.4 10/14/2016
1.6.1 (was 1.5.4) 10/14/2016
- Bug fixes:
- Fix for radix sorting bug introduced by scan refactorization

//-----------------------------------------------------------------------------

1.5.3 10/11/2016
- Device/block/warp-wide exclusive scans now take an "initial value" (instead
1.6.0 (was 1.5.3) 10/11/2016
- API change: Device/block/warp-wide exclusive scans now take an "initial value" (instead
of an "identity value") for seeding the computation with an arbitrary prefix.
- Device-wide reductions and scans can now have input sequence types that are
- API change: Device-wide reductions and scans can now have input sequence types that are
different from output sequence types (as long as they are coercible)
value") for seeding the computation with an arbitrary prefix
- Reduce repository size (move doxygen binary to doc repository)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.5.5 (10/25/2016)
Current release: v1.6.0 (11/20/2016)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ struct AgentSpmv
}
// Return the tile's running carry-out
KeyValuePairT tile_carry = {tile_num_rows, 0.0};
KeyValuePairT tile_carry(tile_num_rows, 0.0);
return tile_carry;
}
*/
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_row_based.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ struct AgentSpmv
OffsetT tile_nonzero_idx = temp_storage.tile_nonzero_idx;
OffsetT tile_nonzero_idx_end = temp_storage.tile_nonzero_idx_end;

KeyValuePairT tile_prefix = {0, 0.0};
KeyValuePairT tile_prefix(0, 0.0);
ReduceBySegmentOpT scan_op;
PrefixOpT prefix_op(tile_prefix, scan_op);

Expand Down
4 changes: 2 additions & 2 deletions cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -521,7 +521,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
Expand Down
4 changes: 2 additions & 2 deletions cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ struct DeviceSegmentedReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent

return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -560,7 +560,7 @@ struct DeviceSegmentedReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent

return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
Expand Down
27 changes: 17 additions & 10 deletions cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -653,8 +653,8 @@ template <
typename _Key,
typename _Value
#if defined(_WIN32) && !defined(_WIN64)
, bool KeyIsLT = sizeof(_Key) < sizeof(_Value),
bool ValIsLT = sizeof(_Value) < sizeof(_Key)
, bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
, bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
#endif // #if defined(_WIN32) && !defined(_WIN64)
>
struct KeyValuePair
Expand Down Expand Up @@ -682,10 +682,17 @@ struct KeyValuePair

#if defined(_WIN32) && !defined(_WIN64)

// Need explicit padding to normalize value alignment and overall structure size
// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers
// (EDG) on the member alignment and size of types passed as template parameters
// through kernel functions
/**
* Win32 won't do 16B alignment. This can present two problems for
* should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
* 1) If a smaller-aligned item were to be listed first, the host compiler places the
* should-be-16B item at too early an offset (and disagrees with device compiler)
* 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
* of the struct wrong (and disagrees with device compiler)
*
* So we put the larger-should-be-aligned item first, and explicitly pad the
* end of the struct
*/

/// Smaller key specialization
template <typename K, typename V>
Expand All @@ -694,9 +701,9 @@ struct KeyValuePair<K, V, true, false>
typedef K Key;
typedef V Value;

typedef char Pad[sizeof(Value) - sizeof(Key)];
typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];

Value value;
Value value; // Value has larger would-be alignment and goes first
Key key;
Pad pad;

Expand All @@ -723,9 +730,9 @@ struct KeyValuePair<K, V, false, true>
typedef K Key;
typedef V Value;

typedef char Pad[sizeof(Key) - sizeof(Value)];
typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];

Key key;
Key key; // Key has larger would-be alignment and goes first
Value value;
Pad pad;

Expand Down
18 changes: 11 additions & 7 deletions test/test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,9 @@ void Initialize(
int num_items)
{
for (int i = 0; i < num_items; ++i)
{
InitValue(gen_mode, h_in[i], i);
}

if (g_verbose_input)
{
Expand Down Expand Up @@ -926,8 +928,10 @@ void TestProblem(
GenMode gen_mode,
ReductionOpT reduction_op)
{
// Initialize host data
printf("\n\nInitializing %s->%s (gen mode %d)... ", typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
fflush(stdout);

// Initialize value data
InputT* h_in = new InputT[num_items];
Initialize(gen_mode, h_in, num_items);

Expand All @@ -936,8 +940,8 @@ void TestProblem(
InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

// Initialize device data
int *d_segment_offsets = NULL;
InputT *d_in = NULL;
int *d_segment_offsets = NULL;
InputT *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
Expand Down Expand Up @@ -985,8 +989,8 @@ void TestByBackend(
GenMode gen_mode)
{
// Initialize host data
printf("\n\nInitializing %s -> %s (gen mode %d)... ",
typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

InputT *h_in = new InputT[num_items];
int *h_segment_offsets = new int[max_segments + 1];
Expand Down Expand Up @@ -1160,7 +1164,7 @@ void TestType(
*/
int main(int argc, char** argv)
{
int max_items = 48000000;
int max_items = 27000000;
int max_segments = 34000;

// Initialize command line
Expand Down

0 comments on commit af70707

Please sign in to comment.