Skip to content

Commit

Permalink
- Added new sorting interface that does not disturb input array
Browse files Browse the repository at this point in the history
- Update to ArgMin/Max functors for more efficient codegen
- Update to histogram interface to provide NPP/IPP-like functionality
- Remove all traces of 8B smem mode setting (which could cause
serializations in streams)

Former-commit-id: df811c0
  • Loading branch information
dumerrill committed Aug 11, 2014
1 parent 109e0f7 commit 7aa5005
Show file tree
Hide file tree
Showing 73 changed files with 2,661 additions and 2,399 deletions.
8 changes: 4 additions & 4 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
1.2.3 04/01/2014
- Bug fixes:
- Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
- Fixed code-snippet bug in ArgIndexInputIterator documentation
- Fixed code-snippet bug in ArgIndexInputIteratorT documentation

//-----------------------------------------------------------------------------

Expand Down Expand Up @@ -87,13 +87,13 @@
- Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)
- Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
- Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
- Added workaround for TexRefInputIterator to work with CUDA 6.0
- Added workaround for TexRefInputIteratorT to work with CUDA 6.0

//-----------------------------------------------------------------------------

1.1.1 12/11/2013
- New features:
- Added TexObjInputIterator, TexRefInputIterator, CacheModifiedInputIterator, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy. Compatible with Thrust API.
- Added descending sorting to DeviceRadixSort and BlockRadixSort
- Added min, max, arg-min, and arg-max to DeviceReduce
- Added DeviceSelect (select-unique, select-if, and select-flagged)
Expand All @@ -103,7 +103,7 @@
- Performance
- Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
- Interface changes:
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIterator and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces. The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
- Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
- Documentation and testing:
- Added simple examples of device-wide methods
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ __global__ void BlockSortKernel(int *d_in, int *d_out)
typename BlockStore::TempStorage store;
} temp_storage;

int block_offset = blockIdx.x * (128 * 16); // Offset for this block's segment
int block_offset = blockIdx.x * (128 * 16); // OffsetT for this block's segment

// Obtain a segment of 2048 consecutive keys that are blocked across threads
int thread_keys[16];
Expand Down
32 changes: 16 additions & 16 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ private:
struct ApplyOp
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b, idx);
}
Expand All @@ -157,7 +157,7 @@ private:
struct ApplyOp<FlagOp, false>
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b);
}
Expand All @@ -178,7 +178,7 @@ private:
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
flags[ITERATION] = ApplyOp<FlagOp>::Flag(
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION - 1],
input[ITERATION],
Expand All @@ -198,7 +198,7 @@ private:
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
flags[ITERATION] = ApplyOp<FlagOp>::Flag(
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION],
input[ITERATION + 1],
Expand Down Expand Up @@ -355,7 +355,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand Down Expand Up @@ -441,7 +441,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand Down Expand Up @@ -526,7 +526,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -613,7 +613,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down Expand Up @@ -710,7 +710,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand All @@ -719,7 +719,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -818,7 +818,7 @@ public:
// Set flag for first thread-item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
temp_storage.last_items[linear_tid - 1],
input[0],
Expand All @@ -829,7 +829,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down Expand Up @@ -936,7 +936,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand All @@ -945,7 +945,7 @@ public:
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
Expand Down Expand Up @@ -1054,7 +1054,7 @@ public:
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];

head_flags[0] = ApplyOp<FlagOp>::Flag(
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
predecessor_item,
input[0],
Expand All @@ -1065,7 +1065,7 @@ public:
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];

tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
Expand Down
48 changes: 24 additions & 24 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for no timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
Expand All @@ -539,10 +539,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for warp-timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
Expand Down Expand Up @@ -591,10 +591,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for no timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
Expand All @@ -620,10 +620,10 @@ private:
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing.
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
Expand Down Expand Up @@ -911,12 +911,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
Expand All @@ -928,12 +928,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
Expand All @@ -945,12 +945,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStripedGuarded(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
Expand Down Expand Up @@ -978,13 +978,13 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam ValidFlag <b>[inferred]</b> Flag type denoting which items are valid
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
* \tparam ValidFlag <b>[inferred]</b> Flag type denoting which items are valid
*/
template <typename Offset, typename ValidFlag>
template <typename OffsetT, typename ValidFlag>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity
{
#pragma unroll
Expand Down Expand Up @@ -1091,12 +1091,12 @@ public:
* \par
* - \smemreuse
*
* \tparam Offset <b>[inferred]</b> Signed integer type for local offsets
* \tparam OffsetT <b>[inferred]</b> Signed integer type for local offsets
*/
template <typename Offset>
template <typename OffsetT>
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
Expand Down
18 changes: 9 additions & 9 deletions cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -285,10 +285,10 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <typename HistoCounter>
__device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
template <typename HistoCounterT>
__device__ __forceinline__ void InitHistogram(HistoCounterT histogram[BINS])
{
// Initialize histogram bin counts to zeros
int histo_offset = 0;
Expand Down Expand Up @@ -340,13 +340,13 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
typename HistoCounterT>
__device__ __forceinline__ void Histogram(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
HistoCounterT histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
// Initialize histogram bin counts to zeros
InitHistogram(histogram);
Expand Down Expand Up @@ -397,13 +397,13 @@ public:
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
* \tparam HistoCounterT <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
typename HistoCounterT>
__device__ __forceinline__ void Composite(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
HistoCounterT histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
InternalBlockHistogram(temp_storage).Composite(items, histogram);
}
Expand Down
Loading

0 comments on commit 7aa5005

Please sign in to comment.