
Commit

Update doc
Former-commit-id: da15706
dumerrill committed May 12, 2014
1 parent 5a464a0 commit c19fef9
Showing 6 changed files with 80 additions and 9 deletions.
34 changes: 34 additions & 0 deletions CHANGE_LOG.TXT
@@ -1,5 +1,39 @@
//-----------------------------------------------------------------------------

1.3.0    05/12/2014
    - New features:
        - CUB's collective (block-wide, warp-wide) primitives underwent a minor
          interface refactoring:
            - To provide appropriate support for multidimensional thread blocks,
              the interfaces for collective classes are now template-parameterized
              by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being
              optional, and BLOCK_DIM_X replacing BLOCK_THREADS). Furthermore, the
              constructors that accept remapped linear thread identifiers have been
              removed: all primitives now assume row-major thread ranking for
              multidimensional thread blocks (see the sketch at the end of this entry).
            - To allow the host program (compiled by the host pass) to
              accurately determine the device-specific storage requirements for
              a given collective (compiled for each device pass), the interfaces
              for collective classes are now (optionally) template-parameterized
              by the desired PTX compute capability. This is useful when
              aliasing collective storage to shared memory that has been
              allocated dynamically by the host at the kernel call site.
        - Most CUB programs with typical 1D usage should not require any
          changes to accommodate these updates.
    - Bug fixes:
        - Fixed a bug in cub::WarpScan (which affected cub::BlockScan and
          cub::DeviceScan) where incorrect results (e.g., NaN) would often be
          returned when parameterized for floating-point types (fp32, fp64).
        - Workaround fix for a ptxas error when compiling with the -G flag on
          Linux (for debug instrumentation).
        - Misc. workaround fixes for certain scan scenarios (using custom
          scan operators) where code compiled for SM1x is run on newer
          GPUs of higher compute capability: the compiler could not tell
          which memory space was being used by collective operations and was
          mistakenly emitting global memory operations instead of shared
          memory operations.
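
    As a quick illustrative sketch of the new multidimensional parameterization
    (the exact template-parameter order and the BLOCK_SCAN_RAKING default shown
    here are assumptions; see cub/block/block_scan.cuh for the authoritative
    signature):

        // Sketch: exclusive prefix sum across a 32x4 (x,y) thread block.
        // BLOCK_DIM_X/Y/Z are template parameters; threads are ranked row-major.
        #include <cub/cub.cuh>

        __global__ void ScanKernel2D(int *d_data)
        {
            // Specialize BlockScan for ints in a 32x4x1 thread block
            // (order T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z assumed)
            typedef cub::BlockScan<int, 32, cub::BLOCK_SCAN_RAKING, 4, 1> BlockScanT;

            __shared__ BlockScanT::TempStorage temp_storage;

            // Row-major linear rank; the remapped-rank constructors no longer exist
            int linear_tid = threadIdx.y * blockDim.x + threadIdx.x;

            int thread_data = d_data[blockIdx.x * 128 + linear_tid];
            BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);
            d_data[blockIdx.x * 128 + linear_tid] = thread_data;
        }

        // Launch sketch: ScanKernel2D<<<num_blocks, dim3(32, 4)>>>(d_data);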

//-----------------------------------------------------------------------------

1.2.3    04/01/2014
    - Bug fixes:
        - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
3 changes: 2 additions & 1 deletion README.md
@@ -1,7 +1,7 @@
<hr>
<h3>About CUB</h3>

Current release: v1.2.3 (April 1, 2014)
Current release: v1.3.0 (May 12, 2014)

We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.

@@ -84,6 +84,7 @@ See [CUB Project Website](http://nvlabs.github.com/cub) for more information.
| Date | Version |
| ---- | ------- |
| 05/12/2014 | [CUB v1.3.0 Primary Release](https://github.com/NVlabs/cub/archive/1.3.0.zip) |
| 04/01/2014 | [CUB v1.2.3 Primary Release](https://github.com/NVlabs/cub/archive/1.2.3.zip) |
| 12/10/2013 | [CUB v1.1.1 Primary Release](https://github.com/NVlabs/cub/archive/1.1.1.zip) |
| 08/08/2013 | [CUB v1.0.1 Primary Release](https://github.com/NVlabs/cub/archive/1.0.1.zip) |
2 changes: 1 addition & 1 deletion docs/Doxyfile
@@ -214,7 +214,7 @@ ALIASES += iterator="(may be a simple pointer type)"
ALIASES += rowmajor="For multi-dimensional blocks, threads are linearly ranked in row-major order."
ALIASES += blocked="Assumes a [<em>blocked arrangement</em>](index.html#sec4sec3) of (<em>block-threads</em>*<em>items-per-thread</em>) items across the thread block, where thread<sub><em>i</em></sub> owns the <em>i</em><sup>th</sup> range of <em>items-per-thread</em> contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed."
ALIASES += striped="Assumes a [<em>striped arrangement</em>](index.html#sec4sec3) of (<em>block-threads</em>*<em>items-per-thread</em>) items across the thread block, where thread<sub><em>i</em></sub> owns items (<em>i</em>), (<em>i</em> + <em>block-threads</em>), ..., (<em>i</em> + (<em>block-threads</em>*(<em>items-per-thread</em>-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed."
ALIASES += warpstriped="Assumes a <em>warp-striped arrangement</em> of elements across threads, where warp<sub><em>i</em></sub> owns the <em>i</em><sup>th</sup> range of (<em>warp-threads</em>*<em>items-per-thread</em>) contiguous items, and each thread owns items (<em>i</em>), (<em>i</em> + <em>warp-threads</em>), ..., (<em>i</em> + (<em>warp-threads</em>*(<em>items-per-thread</em>-1))).
ALIASES += warpstriped="Assumes a <em>warp-striped arrangement</em> of elements across threads, where warp<sub><em>i</em></sub> owns the <em>i</em><sup>th</sup> range of (<em>warp-threads</em>*<em>items-per-thread</em>) contiguous items, and each thread owns items (<em>i</em>), (<em>i</em> + <em>warp-threads</em>), ..., (<em>i</em> + (<em>warp-threads</em>*(<em>items-per-thread</em>-1)))."

ALIASES += linear_performance{1}="The work-complexity of \1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU."
ALIASES += plots_below="Performance plots for other scenarios can be found in the detailed method descriptions below."
6 changes: 3 additions & 3 deletions docs/download_cub.html
@@ -37,14 +37,14 @@
</head>

<body
onload="downloadURL('https://github.com/NVlabs/cub/archive/1.2.3.zip');"
onload="downloadURL('https://github.com/NVlabs/cub/archive/1.3.0.zip');"
style="color: rgb(102, 102, 102); font-family: Helvetica, arial, freesans, clean, sans-serif; font-size: 13px; font-style: normal; font-variant: normal; font-weight: 300; height: 18px;">

<center>
If your download doesn't start in 3s:
<br><br>
<a href="https://github.com/NVlabs/cub/archive/1.2.3.zip"><img src="download-icon.png" style="position:relative; bottom:-10px; border:0px;"/></a>
<a href="https://github.com/NVlabs/cub/archive/1.2.3.zip"><em>Download CUB!</em></a>
<a href="https://github.com/NVlabs/cub/archive/1.3.0.zip"><img src="download-icon.png" style="position:relative; bottom:-10px; border:0px;"/></a>
<a href="https://github.com/NVlabs/cub/archive/1.3.0.zip"><em>Download CUB!</em></a>
</center>

</body>
41 changes: 39 additions & 2 deletions docs/mainpage.dox
@@ -58,7 +58,7 @@

<a href="download_cub.html"><img src="download-icon.png" style="position:relative; bottom:-10px; border:0px;"/></a>
&nbsp;&nbsp;
<a href="download_cub.html"><em><b>Download CUB v1.2.3</b></em></a>
<a href="download_cub.html"><em><b>Download CUB v1.3.0</b></em></a>

</td><td>

@@ -506,9 +506,46 @@ for many of the algorithmic techniques used by CUB.
\par
<table>

<tr><td style="white-space: nowrap; vertical-align:text-top;">
05/12/2014<br>
[<b>CUB v1.3.0</b>](download_cub.html)
</td><td style="vertical-align:text-top;">
- New features:
  - CUB's collective (block-wide, warp-wide) primitives underwent a minor
    interface refactoring:
    - To provide appropriate support for multidimensional thread blocks,
      the interfaces for collective classes are now template-parameterized
      by X, Y, and Z block dimensions (with \p BLOCK_DIM_Y and \p BLOCK_DIM_Z being
      optional, and \p BLOCK_DIM_X replacing \p BLOCK_THREADS). Furthermore, the
      constructors that accept remapped linear thread identifiers have been
      removed: all primitives now assume row-major thread ranking for
      multidimensional thread blocks.
    - To allow the host program (compiled by the host pass) to
      accurately determine the device-specific storage requirements for
      a given collective (compiled for each device pass), the interfaces
      for collective classes are now (optionally) template-parameterized
      by the desired PTX compute capability. This is useful when
      aliasing collective storage to shared memory that has been
      allocated dynamically by the host at the kernel call site (see the
      sketch below).
  - Most CUB programs with typical 1D usage should not require any
    changes to accommodate these updates.
- Bug fixes:
  - Fixed a bug in cub::WarpScan (which affected cub::BlockScan and
    cub::DeviceScan) where incorrect results (e.g., NaN) would often be
    returned when parameterized for floating-point types (fp32, fp64).
  - Workaround fix for a ptxas error when compiling with the -G flag on
    Linux (for debug instrumentation).
  - Misc. workaround fixes for certain scan scenarios (using custom
    scan operators) where code compiled for SM1x is run on newer
    GPUs of higher compute capability: the compiler could not tell
    which memory space was being used by collective operations and was
    mistakenly emitting global memory operations instead of shared
    memory operations.
- See the [change-log](CHANGE_LOG.TXT) for further details.
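
A minimal sketch of the host-pass storage-sizing scenario described above; the
trailing PTX-compute-capability template parameter, its position, and the
ReduceKernel example itself are assumptions rather than part of the release notes:

\code
// Sketch: the host sizes a collective's TempStorage for a fixed PTX target (350)
// and passes it as dynamically allocated shared memory at the kernel call site.
#include <cub/cub.cuh>

// Parameter order (T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH) assumed
typedef cub::BlockReduce<float, 128, cub::BLOCK_REDUCE_RAKING, 1, 1, 350> BlockReduceT;

__global__ void ReduceKernel(const float *d_in, float *d_out)
{
    extern __shared__ char smem[];      // dynamic shared memory from the launch
    BlockReduceT::TempStorage &temp_storage =
        reinterpret_cast<BlockReduceT::TempStorage&>(*smem);

    // One block reduces 128 contiguous inputs; lane 0 writes the aggregate
    float aggregate = BlockReduceT(temp_storage).Sum(d_in[blockIdx.x * 128 + threadIdx.x]);
    if (threadIdx.x == 0) d_out[blockIdx.x] = aggregate;
}

// Host call site: both passes target PTX 350, so the host-computed size matches
// ReduceKernel<<<num_blocks, 128, sizeof(BlockReduceT::TempStorage)>>>(d_in, d_out);
\endcode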
</td></tr>

<tr><td style="white-space: nowrap; vertical-align:text-top;">
04/01/2014<br>
[<b>CUB v1.2.3</b>](download_cub.html)
<b>CUB v1.2.3</b>
</td><td style="vertical-align:text-top;">
- Bug fixes:
- Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
3 changes: 1 addition & 2 deletions test/test_block_scan.cu
@@ -642,7 +642,7 @@ void Test(

#if defined(_WIN32) || defined(_WIN64)
// Accommodate ptxas crash bug (access violation) on Windows
static const bool special_skip = (TEST_ARCH <= 130) && (Equals<T, TestBar>::VALUE) && (ITEMS_PER_THREAD > 1);
static const bool special_skip = (TEST_ARCH <= 130) && (Equals<T, TestBar>::VALUE) && (BLOCK_DIM_Z > 1);
#else
static const bool special_skip = false;
#endif
@@ -795,7 +795,6 @@ void Test()
// complex
Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85), CUB_TYPE_STRING(Sum<TestFoo>));
Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestBar(0, 0), TestBar(17, 21), CUB_TYPE_STRING(Sum<TestBar>));

}


