- Doc updates for v1.3.1

- Fix for bug in cub::DeviceRadixSort where the algorithm may sort more key bits than the caller specified (up to the nearest radix digit). - Fix for a cub::DeviceRadixSort performance regression (~3%) on Kepler and Fermi that was introduced in v1.3.0. Former-commit-id: b6e4280
GaryShen2008 · May 22, 2014 · e89ba82 · e89ba82
1 parent f323274
commit e89ba82
Show file tree

Hide file tree

Showing 290 changed files with 1,744 additions and 1,347 deletions.
diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
@@ -1,25 +1,39 @@
 //-----------------------------------------------------------------------------
 
-1.3.0    03/03/2014
+1.3.1    05/22/2014
     - New features:
-    	- CUB's collective (block-wide, warp-wide) primitives underwent a minor 
-    	  interface refactoring:
-    	    - To provide the appropriate support for multidimensional thread blocks,
-    	      The interfaces for collective classes are now template-parameterized 
-    	      by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being 
-    	      optional, and BLOCK_DIM_X replacing BLOCK_THREADS).  Furthermore, the 
-    	      constructors that accept remapped linear thread-identifiers have been 
-    	      removed: all primitives now assume a row-major thread-ranking for 
-    	      multidimensional thread blocks.  
-    	    - To allow the host program (compiled by the host-pass) to 
-    	      accurately determine the device-specific storage requirements for 
-    	      a given collective (compiled for each device-pass), the interfaces 
-    	      for collective classes are now (optionally) template-parameterized 
-    	      by the desired PTX compute capability. This is useful when 
-    	      aliasing collective storage to shared memory that has been 
-    	      allocated dynamically by the host at the kernel call site.   
-    	    - Most CUB programs having typical 1D usage should not require any 
-    	      changes to accomodate these updates.
+        - Added new "combination" WarpScan methods for efficiently computing 
+          both inclusive and exclusive prefix scans (and sums).
+    - Bug fixes: 
+        - Workaround for a benign WAW race warning reported by cuda-memcheck
+          in BlockScan specialized for BLOCK_SCAN_WARP_SCANS algorithm.
+        - Fix for bug in DeviceRadixSort where the algorithm may sort more 
+          key bits than the caller specified (up to the nearest radix digit).
+        - Fix for ~3% DeviceRadixSort performance regression on Kepler and 
+          Fermi that was introduced in v1.3.0.  
+
+//-----------------------------------------------------------------------------
+
+1.3.0    05/12/2014
+    - New features:
+        - CUB's collective (block-wide, warp-wide) primitives underwent a minor 
+          interface refactoring:
+            - To provide the appropriate support for multidimensional thread blocks,
+              The interfaces for collective classes are now template-parameterized 
+              by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being 
+              optional, and BLOCK_DIM_X replacing BLOCK_THREADS).  Furthermore, the 
+              constructors that accept remapped linear thread-identifiers have been 
+              removed: all primitives now assume a row-major thread-ranking for 
+              multidimensional thread blocks.  
+            - To allow the host program (compiled by the host-pass) to 
+              accurately determine the device-specific storage requirements for 
+              a given collective (compiled for each device-pass), the interfaces 
+              for collective classes are now (optionally) template-parameterized 
+              by the desired PTX compute capability. This is useful when 
+              aliasing collective storage to shared memory that has been 
+              allocated dynamically by the host at the kernel call site.   
+            - Most CUB programs having typical 1D usage should not require any 
+              changes to accomodate these updates.
     - Bug fixes: 
         - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
           cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
@@ -34,7 +48,7 @@
 
 //-----------------------------------------------------------------------------
 
-1.2.3    03/03/2014
+1.2.3    04/01/2014
     - Bug fixes: 
         - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
         - Fixed code-snippet bug in ArgIndexInputIterator documentation 
@@ -43,18 +57,25 @@
 
 1.2.2    03/03/2014
     - New features:
-        - Added device-wide reduce-by-key (DeviceReduce::ReduceByKey, DeviceReduce::RunLengthEncode)
         - Added MS VC++ project solutions for device-wide and block-wide examples 
     - Performance:
-        - Improved DeviceScan, DeviceSelect, DevicePartition performance
         - Added a third algorithmic variant of cub::BlockReduce for improved performance
           when using commutative operators (e.g., numeric addition)
+    - Bug fixes: 
+        - Fixed bug where inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly
+
+//-----------------------------------------------------------------------------
+
+1.2.0    02/25/2014
+    - New features:
+        - Added device-wide reduce-by-key (DeviceReduce::ReduceByKey, DeviceReduce::RunLengthEncode) 
+    - Performance
+        - Improved DeviceScan, DeviceSelect, DevicePartition performance
     - Documentation and testing:
         - Compatible with CUDA 6.0
-        - Added performance-portabiltiy plots for many device-wide primitives to doc 
+        - Added performance-portability plots for many device-wide primitives to doc 
         - Update doc and tests to reflect iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older).
-    - Bug fixes: 
-        - Fixed bug where inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly
+    - Bug fixes 
         - Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)  
         - Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
         - Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block

diff --git a/cub/block/block_radix_rank.cuh b/cub/block/block_radix_rank.cuh
@@ -198,13 +198,17 @@ private:
             UnsignedBits    (&keys)[KEYS_PER_THREAD],               // Key to decode
             DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value (out parameter)
             DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],     // Counter smem offset (out parameter)
-            int             current_bit)                            // The least-significant bit position of the current digit to extract
+            int             current_bit,                            // The least-significant bit position of the current digit to extract
+            int             num_bits)                               // The number of bits in the current digit
         {
+            // Get digit
+            UnsignedBits digit = BFE(keys[COUNT], current_bit, num_bits);
+
             // Get sub-counter
-            UnsignedBits sub_counter = BFE(keys[COUNT], current_bit + LOG_COUNTER_LANES, LOG_PACKING_RATIO);
+            UnsignedBits sub_counter = digit >> LOG_COUNTER_LANES;
 
             // Get counter lane
-            UnsignedBits counter_lane = BFE(keys[COUNT], current_bit, LOG_COUNTER_LANES);
+            UnsignedBits counter_lane = digit & (COUNTER_LANES - 1);
 
             if (DESCENDING)
             {
@@ -222,7 +226,7 @@ private:
             *digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
 
             // Iterate next key
-            Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit);
+            Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit, num_bits);
         }
 
 
@@ -253,15 +257,18 @@ private:
             UnsignedBits    (&keys)[KEYS_PER_THREAD],
             DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
             DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],
-            int             current_bit) {}
+            int             current_bit,                            // The least-significant bit position of the current digit to extract
+            int             num_bits)                               // The number of bits in the current digit
+        {}
 
 
         // UpdateRanks
         template <int KEYS_PER_THREAD>
         static __device__ __forceinline__ void UpdateRanks(
             int             (&ranks)[KEYS_PER_THREAD],
             DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
-            DigitCounter    *(&digit_counters)[KEYS_PER_THREAD]) {}
+            DigitCounter    *(&digit_counters)[KEYS_PER_THREAD])
+        {}
     };
 
 
@@ -416,7 +423,8 @@ public:
     __device__ __forceinline__ void RankKeys(
         UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
         int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit)                        ///< [in] The least-significant bit position of the current digit to extract
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
     {
         DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
         DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
@@ -425,7 +433,7 @@ public:
         ResetCounters();
 
         // Decode keys and update digit counters
-        Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit);
+        Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit, num_bits);
 
         __syncthreads();
 
@@ -449,10 +457,11 @@ public:
         UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
         int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
         int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
         int             &inclusive_digit_prefix)            ///< [out] The incluisve prefix sum for the digit threadIdx.x
     {
         // Rank keys
-        RankKeys(keys, ranks, current_bit);
+        RankKeys(keys, ranks, current_bit, num_bits);
 
         // Get the inclusive and exclusive digit totals corresponding to the calling thread.
         if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS))

diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh
@@ -220,12 +220,14 @@ private:
         UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
         int             (&ranks)[ITEMS_PER_THREAD],
         int             begin_bit,
+        int             pass_bits,
         Int2Type<false> is_descending)
     {
         AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
             unsigned_keys,
             ranks,
-            begin_bit);
+            begin_bit,
+            pass_bits);
     }
 
     /// Rank keys (specialized for descending sort)
@@ -299,9 +301,11 @@ private:
         // Radix sorting passes
         while (true)
         {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
             // Rank the blocked keys
             int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, is_descending);
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
             begin_bit += RADIX_BITS;
 
             __syncthreads();
@@ -349,9 +353,11 @@ private:
         // Radix sorting passes
         while (true)
         {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
             // Rank the blocked keys
             int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, is_descending);
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
             begin_bit += RADIX_BITS;
 
             __syncthreads();

diff --git a/cub/block_range/block_range_radix_sort_downsweep.cuh b/cub/block_range/block_range_radix_sort_downsweep.cuh
@@ -242,6 +242,9 @@ struct BlockRangeRadixSortDownsweep
     // The least-significant bit position of the current digit to extract
     int             current_bit;
 
+    // Number of bits in current digit
+    int             num_bits;
+
     // Whether to short-ciruit
     bool            short_circuit;
 
@@ -261,7 +264,7 @@ struct BlockRangeRadixSortDownsweep
         #pragma unroll
         for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
         {
-            UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, RADIX_BITS);
+            UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits);
 
             // Lookup base digit offset from shared memory
             relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
@@ -522,6 +525,7 @@ struct BlockRangeRadixSortDownsweep
             twiddled_keys,
             ranks,
             current_bit,
+            num_bits,
             inclusive_digit_prefix);
 
         // Update global scatter base offsets for each digit
@@ -639,7 +643,8 @@ struct BlockRangeRadixSortDownsweep
         Key         *d_keys_out,
         Value       *d_values_in,
         Value       *d_values_out,
-        int         current_bit)
+        int         current_bit,
+        int         num_bits)
     :
         temp_storage(temp_storage.Alias()),
         bin_offset(bin_offset),
@@ -648,6 +653,7 @@ struct BlockRangeRadixSortDownsweep
         d_values_in(d_values_in),
         d_values_out(d_values_out),
         current_bit(current_bit),
+        num_bits(num_bits),
         short_circuit(false)
     {}
 
@@ -663,14 +669,16 @@ struct BlockRangeRadixSortDownsweep
         Key         *d_keys_out,
         Value       *d_values_in,
         Value       *d_values_out,
-        int         current_bit)
+        int         current_bit,
+        int         num_bits)
     :
         temp_storage(temp_storage.Alias()),
         d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
         d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
         d_values_in(d_values_in),
         d_values_out(d_values_out),
-        current_bit(current_bit)
+        current_bit(current_bit),
+        num_bits(num_bits)
     {
         // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
         if (threadIdx.x < RADIX_DIGITS)

diff --git a/cub/block_range/block_range_radix_sort_upsweep.cuh b/cub/block_range/block_range_radix_sort_upsweep.cuh
@@ -171,6 +171,9 @@ struct BlockRangeRadixSortUpsweep
     // The least-significant bit position of the current digit to extract
     int             current_bit;
 
+    // Number of bits in current digit
+    int             num_bits;
+
 
 
     //---------------------------------------------------------------------
@@ -214,15 +217,17 @@ struct BlockRangeRadixSortUpsweep
         // Perform transform op
         UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
 
-        // Add in sub-counter offset
-        UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO);
+        // Extract current digit bits
+        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+        // Get sub-counter offset
+        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
 
-        // Add in row offset
-        UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES);
+        // Get row offset
+        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
 
         // Increment counter
         temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
-
     }
 
 
@@ -372,11 +377,13 @@ struct BlockRangeRadixSortUpsweep
     __device__ __forceinline__ BlockRangeRadixSortUpsweep(
         TempStorage &temp_storage,
         Key         *d_keys_in,
-        int         current_bit)
+        int         current_bit,
+        int         num_bits)
     :
         temp_storage(temp_storage.Alias()),
         d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        current_bit(current_bit)
+        current_bit(current_bit),
+        num_bits(num_bits)
     {}