Merge pull request NVIDIA#1 from NVlabs/master
Master
dumerrill authored Oct 24, 2017
2 parents b165e1f + d622848 commit e6b5bf5
Showing 45 changed files with 714 additions and 567 deletions.
84 changes: 48 additions & 36 deletions .cproject

Large diffs are not rendered by default.

22 changes: 21 additions & 1 deletion CHANGE_LOG.TXT
@@ -1,3 +1,23 @@
1.7.4 09/20/2017
- Bug fixes:
- Issue #114: Can't pair non-trivially-constructible values in radix sort
- Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32
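
For context on issue #115: the affected pattern is cub::WarpReduce instantiated with a logical warp size smaller than 32. Below is a minimal sketch of a head-segmented sum over 16-thread logical warps; the kernel shape, block size, and buffer names are illustrative assumptions, not taken from this commit.

```cpp
#include <cub/warp/warp_reduce.cuh>

// Head-segmented sum across logical warps of 16 threads (the configuration
// affected by issue #115).  A block of 64 threads is assumed for this sketch.
__global__ void SegmentedWarpSumKernel(const int *d_in, const int *d_flags, int *d_out)
{
    const int LOGICAL_WARP_THREADS = 16;
    typedef cub::WarpReduce<int, LOGICAL_WARP_THREADS> WarpReduceT;

    // One TempStorage instance per logical warp in the block (64 / 16 = 4)
    __shared__ typename WarpReduceT::TempStorage temp_storage[4];

    int tid       = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id   = threadIdx.x / LOGICAL_WARP_THREADS;
    int item      = d_in[tid];
    int head_flag = d_flags[tid];      // 1 marks the first item of a segment

    // Each segment head receives the sum of its segment
    int segment_sum = WarpReduceT(temp_storage[warp_id]).HeadSegmentedSum(item, head_flag);

    if (head_flag)
        d_out[tid] = segment_sum;
}
```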

//-----------------------------------------------------------------------------

1.7.3 08/28/2017
- Bug fixes:
- Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
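
For context on issue #110: the failing path is a DeviceHistogram call whose samples come from a fancy (non-pointer) iterator. A hedged sketch of that call pattern using cub::TransformInputIterator and DeviceHistogram::HistogramEven follows; the conversion functor, bin count, and buffer names are assumptions made for illustration.

```cpp
#include <cub/device/device_histogram.cuh>
#include <cub/iterator/transform_input_iterator.cuh>

// Functor converting raw bytes to float samples
struct ByteToFloat
{
    __host__ __device__ float operator()(unsigned char b) const { return (float) b; }
};

void HistogramFromIterator(unsigned char *d_bytes, int *d_histogram, int num_samples)
{
    // Wrap the raw input in a fancy iterator: the non-pointer input path
    // exercised by issue #110.
    cub::TransformInputIterator<float, ByteToFloat, unsigned char*>
        d_samples(d_bytes, ByteToFloat());

    // First call: query temporary storage size
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 257, 0.0f, 256.0f, num_samples);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: compute 256 evenly spaced bins over [0, 256)
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, 257, 0.0f, 256.0f, num_samples);

    cudaFree(d_temp_storage);
}
```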

//-----------------------------------------------------------------------------

1.7.2 08/26/2017
- Bug fixes:
- Issue #104: Device-wide reduction is now "run-to-run" deterministic for
pseudo-associative reduction operators (like floating point addition)
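
This guarantee concerns the usual two-phase DeviceReduce invocation sketched below (buffer names are illustrative): with a static work assignment, repeating the second call on the same device and input yields bit-identical floating-point results.

```cpp
#include <cub/device/device_reduce.cuh>

void SumFloats(const float *d_in, float *d_out, int num_items)
{
    // First call: query temporary storage size (no work is done)
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: run the reduction.  Because tiles are assigned to blocks
    // statically, the order of pseudo-associative additions is the same
    // from run to run.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```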

//-----------------------------------------------------------------------------

1.7.1 08/18/2017
- Updated Volta radix sorting tuning policies
- Bug fixes:
@@ -171,7 +191,7 @@
1.3.2 07/28/2014
- Bug fixes:
- Fix for cub::DeviceReduce where reductions of small problems
(small enough to only dispatch a single threadblock) would run in
(small enough to only dispatch a single thread block) would run in
the default stream (stream zero) regardless of whether an alternate
stream was specified.

2 changes: 1 addition & 1 deletion README.md
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.7.1 (08/18/2017)
Current release: v1.7.4 (09/20/2017)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

2 changes: 1 addition & 1 deletion cub/agent/agent_histogram.cuh
@@ -746,7 +746,7 @@ struct AgentHistogram
((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

// Whether rows are aligned and can be vectorized
if (quad_aligned_rows || pixel_aligned_rows)
if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
else
ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
10 changes: 7 additions & 3 deletions cub/agent/agent_radix_sort_downsweep.cuh
@@ -171,6 +171,8 @@ struct AgentRadixSortDownsweep
ITEMS_PER_THREAD,
LOAD_ALGORITHM> BlockLoadValuesT;

// Value exchange array type
typedef ValueT ValueExchangeT[TILE_ITEMS];

/**
* Shared memory storage layout
@@ -187,7 +189,7 @@
OffsetT relative_bin_offsets[RADIX_DIGITS];
};

ValueT exchange_values[TILE_ITEMS];
Uninitialized<ValueExchangeT> exchange_values;

OffsetT exclusive_digit_prefix[RADIX_DIGITS];
};
@@ -276,18 +278,20 @@
{
CTA_SYNC();

ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();

#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
temp_storage.exchange_values[ranks[ITEM]] = values[ITEM];
exchange_values[ranks[ITEM]] = values[ITEM];
}

CTA_SYNC();

#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
ValueT value = temp_storage.exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];

if (FULL_TILE ||
(static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
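
The change above stores the value-exchange array as cub::Uninitialized raw storage and reinterprets it through Alias() on use, so ValueT's constructor never runs on __shared__ memory; this is what permits non-trivially-constructible value types (issue #114) to be paired in a sort. A condensed, hypothetical sketch of that pattern, with names simplified from the diff:

```cpp
#include <cub/util_type.cuh>

// Shared-memory layout: raw (uninitialized) storage for the value exchange,
// so ValueT's default constructor never runs on __shared__ memory.
template <typename ValueT, int TILE_ITEMS>
struct ValueExchangeStorage
{
    typedef ValueT ValueExchangeT[TILE_ITEMS];
    cub::Uninitialized<ValueExchangeT> exchange_values;
};

// Scatter values by rank, then gather them back in blocked order.
template <typename ValueT, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void ExchangeValues(
    ValueExchangeStorage<ValueT, BLOCK_THREADS * ITEMS_PER_THREAD> &storage,
    ValueT (&values)[ITEMS_PER_THREAD],
    int    (&ranks)[ITEMS_PER_THREAD])
{
    // Reinterpret the raw bytes as a typed array only when accessing them
    ValueT (&exchange)[BLOCK_THREADS * ITEMS_PER_THREAD] = storage.exchange_values.Alias();

    __syncthreads();

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        exchange[ranks[ITEM]] = values[ITEM];

    __syncthreads();

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        values[ITEM] = exchange[threadIdx.x + (ITEM * BLOCK_THREADS)];
}
```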
136 changes: 23 additions & 113 deletions cub/agent/agent_reduce.cuh
@@ -38,7 +38,6 @@
#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_queue.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
@@ -64,8 +63,7 @@ template <
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements
struct AgentReducePolicy
{
enum
@@ -77,7 +75,6 @@ struct AgentReducePolicy

static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
};


@@ -148,7 +145,6 @@ struct AgentReduce
struct _TempStorage
{
typename BlockReduceT::TempStorage reduce;
OffsetT dequeue_offset;
};

/// Alias wrapper allowing storage to be unioned
@@ -314,36 +310,35 @@
*/
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeRange(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end, ///< [in] Threadblock end offset (exclusive)
GridEvenShare<OffsetT> &even_share, ///< GridEvenShare descriptor
Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
{
OutputT thread_aggregate;

if (block_offset + TILE_ITEMS > block_end)
if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
{
// First tile isn't full (not all threads have valid items)
int valid_items = block_end - block_offset;
ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
}

// At least one full block
ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
block_offset += TILE_ITEMS;
ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;

// Consume subsequent full tiles of input
while (block_offset + TILE_ITEMS <= block_end)
while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
{
ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
block_offset += TILE_ITEMS;
ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;
}

// Consume a partially-full tile
if (block_offset < block_end)
if (even_share.block_offset < even_share.block_end)
{
int valid_items = block_end - block_offset;
ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
}

// Compute block-wide reduction (all threads have valid items)
@@ -358,113 +353,28 @@ struct AgentReduce
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
GridEvenShare<OffsetT> even_share;
even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);

return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
}


/**
* Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT /*num_items*/, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &/*queue*/, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_EVEN_SHARE> /*is_even_share*/) ///< [in] Marker type indicating this is an even-share mapping
GridEvenShare<OffsetT> &even_share) ///< [in] GridEvenShare descriptor
{
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
// Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());

}


//---------------------------------------------------------------------
// Dynamically consume tiles
//---------------------------------------------------------------------

/**
* Dequeue and reduce tiles of items as part of a inter-block reduction
*/
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeTiles(
int num_items, ///< Total number of input items
GridQueue<OffsetT> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
{
// We give each thread block at least one tile of input.
OutputT thread_aggregate;
OffsetT block_offset = blockIdx.x * TILE_ITEMS;
OffsetT even_share_base = gridDim.x * TILE_ITEMS;

if (block_offset + TILE_ITEMS > num_items)
{
// First tile isn't full (not all threads have valid items)
int valid_items = num_items - block_offset;
ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
}

// Consume first full tile of input
ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());

if (num_items > even_share_base)
{
// Dequeue a tile of items
if (threadIdx.x == 0)
temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

CTA_SYNC();

// Grab tile offset and check if we're done with full tiles
block_offset = temp_storage.dequeue_offset;

// Consume more full tiles
while (block_offset + TILE_ITEMS <= num_items)
{
ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);

CTA_SYNC();

// Dequeue a tile of items
if (threadIdx.x == 0)
temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;

CTA_SYNC();

// Grab tile offset and check if we're done with full tiles
block_offset = temp_storage.dequeue_offset;
}

// Consume partial tile
if (block_offset < num_items)
{
int valid_items = num_items - block_offset;
ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
}
}

// Compute block-wide reduction (all threads have valid items)
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);

}

/**
* Dequeue and reduce tiles of items as part of a inter-block reduction
*/
__device__ __forceinline__ OutputT ConsumeTiles(
OffsetT num_items, ///< [in] Total number of global input items
GridEvenShare<OffsetT> &/*even_share*/, ///< [in] GridEvenShare descriptor
GridQueue<OffsetT> &queue, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_DYNAMIC> /*is_dynamic*/) ///< [in] Marker type indicating this is a dynamic mapping
{
return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
}

};
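
With the GridQueue path removed, AgentReduce always walks a statically precomputed even-share of tiles: each block starts at its own offset and advances by a fixed block_stride until it reaches block_end. The loop below is a loose, item-granular analogue in plain CUDA (the real code consumes whole tiles described by cub::GridEvenShare); it illustrates why the order of additions, and therefore the rounded result, is identical from run to run.

```cpp
// Per-thread partial sum with a statically strided (strip-mined) assignment
// of work to blocks.  All names and the item-granular loop are illustrative.
__device__ float ThreadPartialSum(const float *d_in, int num_items)
{
    float thread_aggregate = 0.0f;

    // Each thread revisits offsets offset, offset + stride, ...  The
    // schedule never depends on execution order or on a shared queue, so
    // the sequence of additions is the same on every run.
    for (int offset = blockIdx.x * blockDim.x + threadIdx.x;
         offset < num_items;
         offset += gridDim.x * blockDim.x)
    {
        thread_aggregate += d_in[offset];
    }

    return thread_aggregate;   // subsequently combined with a block-wide reduction
}
```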
2 changes: 1 addition & 1 deletion cub/agent/agent_reduce_by_key.cuh
@@ -223,7 +223,7 @@ struct AgentReduceByKey
typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1];
typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1];

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
@@ -208,7 +208,7 @@ struct AgentRle

typedef LengthOffsetPair WarpAggregates[WARPS];

// Shared memory type for this threadblock
// Shared memory type for this thread block
struct _TempStorage
{
// Aliasable storage layout
2 changes: 1 addition & 1 deletion cub/agent/agent_scan.cuh
@@ -163,7 +163,7 @@ struct AgentScan
ScanOpT>
RunningPrefixCallbackOp;

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
typename BlockLoadT::TempStorage load; // Smem needed for tile loading
2 changes: 1 addition & 1 deletion cub/agent/agent_segment_fixup.cuh
@@ -162,7 +162,7 @@ struct AgentSegmentFixup
ScanTileStateT>
TilePrefixCallbackOpT;

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
2 changes: 1 addition & 1 deletion cub/agent/agent_select_if.cuh
@@ -192,7 +192,7 @@ struct AgentSelectIf
// Item exchange type
typedef OutputT ItemExchangeT[TILE_ITEMS];

// Shared memory type for this threadblock
// Shared memory type for this thread block
union _TempStorage
{
struct
6 changes: 3 additions & 3 deletions cub/block/block_radix_rank.cuh
@@ -28,7 +28,7 @@

/**
* \file
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
*/

#pragma once
@@ -51,7 +51,7 @@ CUB_NS_PREFIX
namespace cub {

/**
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
* \ingroup BlockModule
*
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
@@ -398,7 +398,7 @@ public:
// Extract the local ranks of each key
for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
{
// Add in threadblock exclusive prefix
// Add in thread block exclusive prefix
ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
}
}
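
BlockRadixRank is the digit-ranking primitive beneath cub::BlockRadixSort. For orientation, here is a minimal sketch of the public block-level sort built on it; the block and tile dimensions are assumptions chosen for illustration.

```cpp
#include <cub/block/block_radix_sort.cuh>

// Sort 512 integer keys per block (128 threads x 4 items each).
__global__ void BlockSortKernel(int *d_keys)
{
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSortT;
    __shared__ typename BlockRadixSortT::TempStorage temp_storage;

    // Each thread owns 4 consecutive ("blocked") keys
    int thread_keys[4];
    int block_offset = blockIdx.x * (128 * 4);
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[block_offset + threadIdx.x * 4 + i];

    // Cooperative block-wide radix sort (ascending)
    BlockRadixSortT(temp_storage).Sort(thread_keys);

    for (int i = 0; i < 4; ++i)
        d_keys[block_offset + threadIdx.x * 4 + i] = thread_keys[i];
}
```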