diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 00d9cb05b6..7b146ef1da 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,20 +1,33 @@ -1.5.5 10/25/2016 +1.6.3 11/20/2016 + - Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s) + - Bug fixes: + - Issue #74: Warpreduce executes reduction operator for out-of-bounds items + - Issue #72 (cub:InequalityWrapper::operator() should be non-const) + - Issue #71 (KeyVairPair won't work if Key has non-trivial ctor) + - Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx + arch specialization) + - Issue #70 1.5.3 breaks BlockScan API. Retroactively reversioned + from v1.5.3 -> v1.6 to appropriately indicate API change. + +//----------------------------------------------------------------------------- + +1.6.2 (waws 1.5.5) 10/25/2016 - Updated Pascal tuning policies for radix sorting - Bug fixes: - Fix for arm64 compilation of caching allocator //----------------------------------------------------------------------------- -1.5.4 10/14/2016 +1.6.1 (was 1.5.4) 10/14/2016 - Bug fixes: - Fix for radix sorting bug introduced by scan refactorization //----------------------------------------------------------------------------- -1.5.3 10/11/2016 - - Device/block/warp-wide exclusive scans now take an "initial value" (instead +1.6.0 (was 1.5.3) 10/11/2016 + - API change: Device/block/warp-wide exclusive scans now take an "initial value" (instead of an "identity value") for seeding the computation with an arbitrary prefix. - - Device-wide reductions and scans can now have input sequence types that are + - API change: Device-wide reductions and scans can now have input sequence types that are differnet from output sequence types (as long as they are coercible) value") for seeding the computation with an arbitrary prefix - Reduce repository size (move doxygen binary to doc repository) diff --git a/README.md b/README.md index 495d96f5d3..8b8313ca71 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

About CUB

-Current release: v1.5.5 (10/25/2016) +Current release: v1.6.0 (11/20/2016) We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples. diff --git a/cub/agent/agent_spmv_orig.cuh b/cub/agent/agent_spmv_orig.cuh index 0babd7b77e..37871444c9 100644 --- a/cub/agent/agent_spmv_orig.cuh +++ b/cub/agent/agent_spmv_orig.cuh @@ -748,7 +748,7 @@ struct AgentSpmv } // Return the tile's running carry-out - KeyValuePairT tile_carry = {tile_num_rows, 0.0}; + KeyValuePairT tile_carry(tile_num_rows, 0.0); return tile_carry; } */ diff --git a/cub/agent/agent_spmv_row_based.cuh b/cub/agent/agent_spmv_row_based.cuh index 975903cb24..c5820112ec 100644 --- a/cub/agent/agent_spmv_row_based.cuh +++ b/cub/agent/agent_spmv_row_based.cuh @@ -430,7 +430,7 @@ struct AgentSpmv OffsetT tile_nonzero_idx = temp_storage.tile_nonzero_idx; OffsetT tile_nonzero_idx_end = temp_storage.tile_nonzero_idx_end; - KeyValuePairT tile_prefix = {0, 0.0}; + KeyValuePairT tile_prefix(0, 0.0); ReduceBySegmentOpT scan_op; PrefixOpT prefix_op(tile_prefix, scan_op); diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 1983b57b59..080d1b898a 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -373,7 +373,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputT initial_value = {1, Traits::Max()}; // replace with std::numeric_limits::max() when C++11 support is more prevalent + OutputT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent return DispatchReduce::Dispatch( d_temp_storage, @@ -521,7 +521,7 @@ struct DeviceReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputT initial_value = {1, Traits::Lowest()}; // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + OutputT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent return DispatchReduce::Dispatch( d_temp_storage, diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh index 0dd5c14b78..07b05e9310 100644 --- a/cub/device/device_segmented_reduce.cuh +++ b/cub/device/device_segmented_reduce.cuh @@ -390,7 +390,7 @@ struct DeviceSegmentedReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputT initial_value = {1, Traits::Max()}; // replace with std::numeric_limits::max() when C++11 support is more prevalent + OutputT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent return DispatchSegmentedReduce::Dispatch( d_temp_storage, @@ -560,7 +560,7 @@ struct DeviceSegmentedReduce ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - OutputT initial_value = {1, Traits::Lowest()}; // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + OutputT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent return DispatchSegmentedReduce::Dispatch( d_temp_storage, diff --git a/cub/util_type.cuh b/cub/util_type.cuh index 138f9b803f..95bdfdc9e4 100644 --- a/cub/util_type.cuh +++ b/cub/util_type.cuh @@ -653,8 +653,8 @@ template < typename _Key, typename _Value #if defined(_WIN32) && !defined(_WIN64) - , bool KeyIsLT = sizeof(_Key) < sizeof(_Value), - bool ValIsLT = sizeof(_Value) < sizeof(_Key) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) #endif // #if defined(_WIN32) && !defined(_WIN64) > struct KeyValuePair @@ -682,10 +682,17 @@ struct KeyValuePair #if defined(_WIN32) && !defined(_WIN64) -// Need explicit padding to normalize value alignment and overall structure size -// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers -// (EDG) on the member alignment and size of types passed as template parameters -// through kernel functions +/** + * Win32 won't do 16B alignment. This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ /// Smaller key specialization template @@ -694,9 +701,9 @@ struct KeyValuePair typedef K Key; typedef V Value; - typedef char Pad[sizeof(Value) - sizeof(Key)]; + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - Value value; + Value value; // Value has larger would-be alignment and goes first Key key; Pad pad; @@ -723,9 +730,9 @@ struct KeyValuePair typedef K Key; typedef V Value; - typedef char Pad[sizeof(Key) - sizeof(Value)]; + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - Key key; + Key key; // Key has larger would-be alignment and goes first Value value; Pad pad; diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 6f88022b37..edc06c5117 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -666,7 +666,9 @@ void Initialize( int num_items) { for (int i = 0; i < num_items; ++i) + { InitValue(gen_mode, h_in[i], i); + } if (g_verbose_input) { @@ -926,8 +928,10 @@ void TestProblem( GenMode gen_mode, ReductionOpT reduction_op) { - // Initialize host data - printf("\n\nInitializing %s->%s (gen mode %d)... ", typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); + printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); + fflush(stdout); + + // Initialize value data InputT* h_in = new InputT[num_items]; Initialize(gen_mode, h_in, num_items); @@ -936,8 +940,8 @@ void TestProblem( InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); // Initialize device data - int *d_segment_offsets = NULL; - InputT *d_in = NULL; + int *d_segment_offsets = NULL; + InputT *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); @@ -985,8 +989,8 @@ void TestByBackend( GenMode gen_mode) { // Initialize host data - printf("\n\nInitializing %s -> %s (gen mode %d)... ", - typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); + printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", + num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); InputT *h_in = new InputT[num_items]; int *h_segment_offsets = new int[max_segments + 1]; @@ -1160,7 +1164,7 @@ void TestType( */ int main(int argc, char** argv) { - int max_items = 48000000; + int max_items = 27000000; int max_segments = 34000; // Initialize command line