Skip to content

Commit

Permalink
Better fix for Issue NVIDIA#71 (KeyValuePair won't work if Key has non…
Browse files Browse the repository at this point in the history
…-trivial

ctor)

https://github.com/NVlabs/cub/issues/71

Doc fixes for Issue NVIDIA#70 (1.5.5 breaks BlockScan API)

https://github.com/NVlabs/cub/issues/70
Former-commit-id: eb1c1432ad3df4d29dc055d45f4ccc3b00c8b2ba
  • Loading branch information
dumerrill committed Nov 21, 2016
1 parent 7b34462 commit af70707
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 29 deletions.
23 changes: 18 additions & 5 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
1.5.5 10/25/2016
1.6.3 11/20/2016
- Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
- Bug fixes:
- Issue #74: WarpReduce executes reduction operator for out-of-bounds items
- Issue #72 (cub::InequalityWrapper::operator() should be non-const)
- Issue #71 (KeyValuePair won't work if Key has non-trivial ctor)
- Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx
arch specialization)
- Issue #70 1.5.3 breaks BlockScan API. Retroactively reversioned
from v1.5.3 -> v1.6 to appropriately indicate API change.

//-----------------------------------------------------------------------------

1.6.2 (was 1.5.5) 10/25/2016
- Updated Pascal tuning policies for radix sorting
- Bug fixes:
- Fix for arm64 compilation of caching allocator

//-----------------------------------------------------------------------------

1.5.4 10/14/2016
1.6.1 (was 1.5.4) 10/14/2016
- Bug fixes:
- Fix for radix sorting bug introduced by scan refactorization

//-----------------------------------------------------------------------------

1.5.3 10/11/2016
- Device/block/warp-wide exclusive scans now take an "initial value" (instead
1.6.0 (was 1.5.3) 10/11/2016
- API change: Device/block/warp-wide exclusive scans now take an "initial value" (instead
of an "identity value") for seeding the computation with an arbitrary prefix.
- Device-wide reductions and scans can now have input sequence types that are
- API change: Device-wide reductions and scans can now have input sequence types that are
different from output sequence types (as long as they are coercible)
value") for seeding the computation with an arbitrary prefix
- Reduce repository size (move doxygen binary to doc repository)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.5.5 (10/25/2016)
Current release: v1.6.0 (11/20/2016)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ struct AgentSpmv
}
// Return the tile's running carry-out
KeyValuePairT tile_carry = {tile_num_rows, 0.0};
KeyValuePairT tile_carry(tile_num_rows, 0.0);
return tile_carry;
}
*/
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_row_based.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ struct AgentSpmv
OffsetT tile_nonzero_idx = temp_storage.tile_nonzero_idx;
OffsetT tile_nonzero_idx_end = temp_storage.tile_nonzero_idx_end;

KeyValuePairT tile_prefix = {0, 0.0};
KeyValuePairT tile_prefix(0, 0.0);
ReduceBySegmentOpT scan_op;
PrefixOpT prefix_op(tile_prefix, scan_op);

Expand Down
4 changes: 2 additions & 2 deletions cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -521,7 +521,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
Expand Down
4 changes: 2 additions & 2 deletions cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ struct DeviceSegmentedReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Max()}; // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent

return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
Expand Down Expand Up @@ -560,7 +560,7 @@ struct DeviceSegmentedReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
OutputT initial_value = {1, Traits<InputValueT>::Lowest()}; // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
OutputT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent

return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
Expand Down
27 changes: 17 additions & 10 deletions cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -653,8 +653,8 @@ template <
typename _Key,
typename _Value
#if defined(_WIN32) && !defined(_WIN64)
, bool KeyIsLT = sizeof(_Key) < sizeof(_Value),
bool ValIsLT = sizeof(_Value) < sizeof(_Key)
, bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
, bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
#endif // #if defined(_WIN32) && !defined(_WIN64)
>
struct KeyValuePair
Expand Down Expand Up @@ -682,10 +682,17 @@ struct KeyValuePair

#if defined(_WIN32) && !defined(_WIN64)

// Need explicit padding to normalize value alignment and overall structure size
// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers
// (EDG) on the member alignment and size of types passed as template parameters
// through kernel functions
/**
* Win32 won't do 16B alignment. This can present two problems for
* should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
* 1) If a smaller-aligned item were to be listed first, the host compiler places the
* should-be-16B item at too early an offset (and disagrees with device compiler)
* 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
* of the struct wrong (and disagrees with device compiler)
*
* So we put the larger-should-be-aligned item first, and explicitly pad the
* end of the struct
*/

/// Smaller key specialization
template <typename K, typename V>
Expand All @@ -694,9 +701,9 @@ struct KeyValuePair<K, V, true, false>
typedef K Key;
typedef V Value;

typedef char Pad[sizeof(Value) - sizeof(Key)];
typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];

Value value;
Value value; // Value has larger would-be alignment and goes first
Key key;
Pad pad;

Expand All @@ -723,9 +730,9 @@ struct KeyValuePair<K, V, false, true>
typedef K Key;
typedef V Value;

typedef char Pad[sizeof(Key) - sizeof(Value)];
typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];

Key key;
Key key; // Key has larger would-be alignment and goes first
Value value;
Pad pad;

Expand Down
18 changes: 11 additions & 7 deletions test/test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,9 @@ void Initialize(
int num_items)
{
for (int i = 0; i < num_items; ++i)
{
InitValue(gen_mode, h_in[i], i);
}

if (g_verbose_input)
{
Expand Down Expand Up @@ -926,8 +928,10 @@ void TestProblem(
GenMode gen_mode,
ReductionOpT reduction_op)
{
// Initialize host data
printf("\n\nInitializing %s->%s (gen mode %d)... ", typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
fflush(stdout);

// Initialize value data
InputT* h_in = new InputT[num_items];
Initialize(gen_mode, h_in, num_items);

Expand All @@ -936,8 +940,8 @@ void TestProblem(
InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

// Initialize device data
int *d_segment_offsets = NULL;
InputT *d_in = NULL;
int *d_segment_offsets = NULL;
InputT *d_in = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
Expand Down Expand Up @@ -985,8 +989,8 @@ void TestByBackend(
GenMode gen_mode)
{
// Initialize host data
printf("\n\nInitializing %s -> %s (gen mode %d)... ",
typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

InputT *h_in = new InputT[num_items];
int *h_segment_offsets = new int[max_segments + 1];
Expand Down Expand Up @@ -1160,7 +1164,7 @@ void TestType(
*/
int main(int argc, char** argv)
{
int max_items = 48000000;
int max_items = 27000000;
int max_segments = 34000;

// Initialize command line
Expand Down

0 comments on commit af70707

Please sign in to comment.