From af707078e959c3797e16d90ad2530244211f0219 Mon Sep 17 00:00:00 2001
From: dumerrill
Date: Sun, 20 Nov 2016 22:13:07 -0700
Subject: [PATCH] Better fix for Issue #71 (KeyValuePair won't work if Key has
 non-trivial ctor) https://github.com/NVlabs/cub/issues/71 Doc fixes for
 Issue #70 (1.5.5 breaks BlockScan API) https://github.com/NVlabs/cub/issues/70

Former-commit-id: eb1c1432ad3df4d29dc055d45f4ccc3b00c8b2ba
---
 CHANGE_LOG.TXT                         | 23 +++++++++++++++++-----
 README.md                              |  2 +-
 cub/agent/agent_spmv_orig.cuh          |  2 +-
 cub/agent/agent_spmv_row_based.cuh     |  2 +-
 cub/device/device_reduce.cuh           |  4 ++--
 cub/device/device_segmented_reduce.cuh |  4 ++--
 cub/util_type.cuh                      | 27 ++++++++++++++++----------
 test/test_device_reduce.cu             | 18 ++++++++++-------
 8 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
index 00d9cb05b6..7b146ef1da 100644
--- a/CHANGE_LOG.TXT
+++ b/CHANGE_LOG.TXT
@@ -1,20 +1,33 @@
-1.5.5    10/25/2016
+1.6.3    11/20/2016
+    - Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
+    - Bug fixes:
+         - Issue #74: WarpReduce executes reduction operator for out-of-bounds items
+         - Issue #72 (cub::InequalityWrapper::operator() should be non-const)
+         - Issue #71 (KeyValuePair won't work if Key has non-trivial ctor)
+         - Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx
+           arch specialization)
+         - Issue #70 (1.5.3 breaks BlockScan API).  Retroactively reversioned
+           from v1.5.3 -> v1.6 to appropriately indicate API change.
+
+//-----------------------------------------------------------------------------
+
+1.6.2 (was 1.5.5)    10/25/2016
     - Updated Pascal tuning policies for radix sorting
     - Bug fixes:
          - Fix for arm64 compilation of caching allocator
 
 //-----------------------------------------------------------------------------
 
-1.5.4    10/14/2016
+1.6.1 (was 1.5.4)    10/14/2016
     - Bug fixes:
          - Fix for radix sorting bug introduced by scan refactorization
 
 //-----------------------------------------------------------------------------
 
-1.5.3    10/11/2016
-    - Device/block/warp-wide exclusive scans now take an "initial value" (instead
+1.6.0 (was 1.5.3)    10/11/2016
+    - API change: Device/block/warp-wide exclusive scans now take an "initial value" (instead
       of an "identity value") for seeding the computation with an arbitrary prefix.
-    - Device-wide reductions and scans can now have input sequence types that are
+    - API change: Device-wide reductions and scans can now have input sequence types that are
       differnet from output sequence types (as long as they are coercible)
      value") for seeding the computation with an arbitrary prefix
     - Reduce repository size (move doxygen binary to doc repository)
diff --git a/README.md b/README.md
index 495d96f5d3..8b8313ca71 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 
 About CUB
 
-Current release: v1.5.5 (10/25/2016)
+Current release: v1.6.3 (11/20/2016)
 
 We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
 
diff --git a/cub/agent/agent_spmv_orig.cuh b/cub/agent/agent_spmv_orig.cuh
index 0babd7b77e..37871444c9 100644
--- a/cub/agent/agent_spmv_orig.cuh
+++ b/cub/agent/agent_spmv_orig.cuh
@@ -748,7 +748,7 @@ struct AgentSpmv
         }
 
         // Return the tile's running carry-out
-        KeyValuePairT tile_carry = {tile_num_rows, 0.0};
+        KeyValuePairT tile_carry(tile_num_rows, 0.0);
         return tile_carry;
     }
 */
diff --git a/cub/agent/agent_spmv_row_based.cuh b/cub/agent/agent_spmv_row_based.cuh
index 975903cb24..c5820112ec 100644
--- a/cub/agent/agent_spmv_row_based.cuh
+++ b/cub/agent/agent_spmv_row_based.cuh
@@ -430,7 +430,7 @@ struct AgentSpmv
         OffsetT tile_nonzero_idx     = temp_storage.tile_nonzero_idx;
         OffsetT tile_nonzero_idx_end = temp_storage.tile_nonzero_idx_end;
 
-        KeyValuePairT tile_prefix = {0, 0.0};
+        KeyValuePairT tile_prefix(0, 0.0);
         ReduceBySegmentOpT scan_op;
         PrefixOpT prefix_op(tile_prefix, scan_op);
diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 1983b57b59..080d1b898a 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -373,7 +373,7 @@ struct DeviceReduce
         ArgIndexInputIteratorT d_indexed_in(d_in);
 
         // Initial value
-        OutputT initial_value = {1, Traits::Max()};     // replace with std::numeric_limits::max() when C++11 support is more prevalent
+        OutputT initial_value(1, Traits::Max());        // replace with std::numeric_limits::max() when C++11 support is more prevalent
 
         return DispatchReduce::Dispatch(
             d_temp_storage,
@@ -521,7 +521,7 @@ struct DeviceReduce
         ArgIndexInputIteratorT d_indexed_in(d_in);
 
         // Initial value
-        OutputT initial_value = {1, Traits::Lowest()};  // replace with std::numeric_limits::lowest() when C++11 support is more prevalent
+        OutputT initial_value(1, Traits::Lowest());     // replace with std::numeric_limits::lowest() when C++11 support is more prevalent
 
         return DispatchReduce::Dispatch(
             d_temp_storage,
diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh
index 0dd5c14b78..07b05e9310 100644
--- a/cub/device/device_segmented_reduce.cuh
+++ b/cub/device/device_segmented_reduce.cuh
@@ -390,7 +390,7 @@ struct DeviceSegmentedReduce
         ArgIndexInputIteratorT d_indexed_in(d_in);
 
         // Initial value
-        OutputT initial_value = {1, Traits::Max()};     // replace with std::numeric_limits::max() when C++11 support is more prevalent
+        OutputT initial_value(1, Traits::Max());        // replace with std::numeric_limits::max() when C++11 support is more prevalent
 
         return DispatchSegmentedReduce::Dispatch(
             d_temp_storage,
@@ -560,7 +560,7 @@ struct DeviceSegmentedReduce
         ArgIndexInputIteratorT d_indexed_in(d_in);
 
         // Initial value
-        OutputT initial_value = {1, Traits::Lowest()};  // replace with std::numeric_limits::lowest() when C++11 support is more prevalent
+        OutputT initial_value(1, Traits::Lowest());     // replace with std::numeric_limits::lowest() when C++11 support is more prevalent
 
         return DispatchSegmentedReduce::Dispatch(
             d_temp_storage,
diff --git a/cub/util_type.cuh b/cub/util_type.cuh
index 138f9b803f..95bdfdc9e4 100644
--- a/cub/util_type.cuh
+++ b/cub/util_type.cuh
@@ -653,8 +653,8 @@ template <
     typename _Key,
     typename _Value
 #if defined(_WIN32) && !defined(_WIN64)
-    , bool KeyIsLT = sizeof(_Key) < sizeof(_Value),
-    bool ValIsLT = sizeof(_Value) < sizeof(_Key)
+    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
+    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
 #endif // #if defined(_WIN32) && !defined(_WIN64)
 >
 struct KeyValuePair
@@ -682,10 +682,17 @@ struct KeyValuePair
 
 #if defined(_WIN32) && !defined(_WIN64)
 
-// Need explicit padding to normalize value alignment and overall structure size
-// because the Win32 host compiler (VC++) may disagree with CUDA device C++ compilers
-// (EDG) on the member alignment and size of types passed as template parameters
-// through kernel functions
+/**
+ * Win32 won't do 16B alignment.  This can present two problems for
+ * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsic members:
+ * 1) If a smaller-aligned item were to be listed first, the host compiler places the
+ *    should-be-16B item at too early an offset (and disagrees with device compiler)
+ * 2) Or, if a smaller-aligned item is listed second, the host compiler gets the size
+ *    of the struct wrong (and disagrees with device compiler)
+ *
+ * So we put the larger-should-be-aligned item first, and explicitly pad the
+ * end of the struct
+ */
 
 /// Smaller key specialization
 template <typename K, typename V>
@@ -693,9 +700,9 @@ struct KeyValuePair
 {
     typedef K Key;
     typedef V Value;
-    typedef char Pad[sizeof(Value) - sizeof(Key)];
+    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
 
-    Value value;
+    Value   value;      // Value has larger would-be alignment and goes first
     Key key;
     Pad pad;
 
@@ -723,9 +730,9 @@ struct KeyValuePair
     typedef K Key;
     typedef V Value;
 
-    typedef char Pad[sizeof(Key) - sizeof(Value)];
+    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
 
-    Key key;
+    Key     key;        // Key has larger would-be alignment and goes first
     Value value;
     Pad pad;
 
diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index 6f88022b37..edc06c5117 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -666,7 +666,9 @@ void Initialize(
     int             num_items)
 {
     for (int i = 0; i < num_items; ++i)
+    {
         InitValue(gen_mode, h_in[i], i);
+    }
 
     if (g_verbose_input)
     {
@@ -926,8 +928,10 @@ void TestProblem(
     GenMode         gen_mode,
     ReductionOpT    reduction_op)
 {
-    // Initialize host data
-    printf("\n\nInitializing %s->%s (gen mode %d)... ", typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+    fflush(stdout);
+
+    // Initialize value data
     InputT* h_in = new InputT[num_items];
     Initialize(gen_mode, h_in, num_items);
 
@@ -936,8 +940,8 @@ void TestProblem(
     InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
 
     // Initialize device data
-    int *d_segment_offsets = NULL;
-    InputT *d_in = NULL;
+    int     *d_segment_offsets  = NULL;
+    InputT  *d_in               = NULL;
     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
     CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
@@ -985,8 +989,8 @@ void TestByBackend(
     GenMode         gen_mode)
 {
     // Initialize host data
-    printf("\n\nInitializing %s -> %s (gen mode %d)... ",
-        typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
+        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
 
     InputT *h_in = new InputT[num_items];
     int *h_segment_offsets = new int[max_segments + 1];
@@ -1160,7 +1164,7 @@ void TestType(
 */
 int main(int argc, char** argv)
 {
-    int max_items       = 48000000;
+    int max_items       = 27000000;
     int max_segments    = 34000;
 
     // Initialize command line