From c19fef9a60e4db64d265a20a3480b3ba2a06e64e Mon Sep 17 00:00:00 2001 From: dumerrill Date: Mon, 12 May 2014 16:55:22 -0400 Subject: [PATCH] Update doc Former-commit-id: da157061d1d6cc02ffffdd5ffef73464c42a0c1f --- CHANGE_LOG.TXT | 34 ++++++++++++++++++++++++++++++++++ README.md | 3 ++- docs/Doxyfile | 2 +- docs/download_cub.html | 6 +++--- docs/mainpage.dox | 41 +++++++++++++++++++++++++++++++++++++++-- test/test_block_scan.cu | 3 +-- 6 files changed, 80 insertions(+), 9 deletions(-) diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 1c7e6ab15a..8c0b9a89ad 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,5 +1,39 @@ //----------------------------------------------------------------------------- +1.3.0 05/12/2014 + - New features: + - CUB's collective (block-wide, warp-wide) primitives underwent a minor + interface refactoring: + - To provide the appropriate support for multidimensional thread blocks, + the interfaces for collective classes are now template-parameterized + by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being + optional, and BLOCK_DIM_X replacing BLOCK_THREADS). Furthermore, the + constructors that accept remapped linear thread-identifiers have been + removed: all primitives now assume a row-major thread-ranking for + multidimensional thread blocks. + - To allow the host program (compiled by the host-pass) to + accurately determine the device-specific storage requirements for + a given collective (compiled for each device-pass), the interfaces + for collective classes are now (optionally) template-parameterized + by the desired PTX compute capability. This is useful when + aliasing collective storage to shared memory that has been + allocated dynamically by the host at the kernel call site. + - Most CUB programs having typical 1D usage should not require any + changes to accommodate these updates. 
+ - Bug fixes: + - Fixed bug in cub::WarpScan (which affected cub::BlockScan and + cub::DeviceScan) where incorrect results (e.g., NAN) would often be + returned when parameterized for floating-point types (fp32, fp64). + - Workaround-fix for ptxas error when compiling with -G flag on Linux + (for debug instrumentation) + - Misc. workaround-fixes for certain scan scenarios (using custom + scan operators) where code compiled for SM1x is run on newer + GPUs of higher compute-capability: the compiler could not tell + which memory space was being used for collective operations and was + mistakenly using global ops instead of shared ops. + +//----------------------------------------------------------------------------- + 1.2.3 03/03/2014 - Bug fixes: - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types diff --git a/README.md b/README.md index a1a60911e3..3eae36af7b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

About CUB

-Current release: v1.2.3 (April 1, 2014) +Current release: v1.3.0 (May 12, 2014) We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples. @@ -84,6 +84,7 @@ See [CUB Project Website](http://nvlabs.github.com/cub) for more information. | Date | Version | | ---- | ------- | +| 05/12/2014 | [CUB v1.3.0 Primary Release](https://github.com/NVlabs/cub/archive/1.3.0.zip) | | 04/01/2014 | [CUB v1.2.3 Primary Release](https://github.com/NVlabs/cub/archive/1.2.3.zip) | | 12/10/2013 | [CUB v1.1.1 Primary Release](https://github.com/NVlabs/cub/archive/1.1.1.zip) | | 08/08/2013 | [CUB v1.0.1 Primary Release](https://github.com/NVlabs/cub/archive/1.0.1.zip) | diff --git a/docs/Doxyfile b/docs/Doxyfile index 09088d127d..d8ef943631 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -214,7 +214,7 @@ ALIASES += iterator="(may be a simple pointer type)" ALIASES += rowmajor="For multi-dimensional blocks, threads are linearly ranked in row-major order." ALIASES += blocked="Assumes a [blocked arrangement](index.html#sec4sec3) of (block-threads*items-per-thread) items across the thread block, where threadi owns the ith range of items-per-thread contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed." ALIASES += striped="Assumes a [striped arrangement](index.html#sec4sec3) of (block-threads*items-per-thread) items across the thread block, where threadi owns items (i), (i + block-threads), ..., (i + (block-threads*(items-per-thread-1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed." -ALIASES += warpstriped="Assumes a warp-striped arrangement of elements across threads, where warpi owns the ith range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1))). 
+ALIASES += warpstriped="Assumes a warp-striped arrangement of elements across threads, where warpi owns the ith range of (warp-threads*items-per-thread) contiguous items, and each thread owns items (i), (i + warp-threads), ..., (i + (warp-threads*(items-per-thread-1)))." ALIASES += linear_performance{1}="The work-complexity of \1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU." ALIASES += plots_below="Performance plots for other scenarios can be found in the detailed method descriptions below." diff --git a/docs/download_cub.html b/docs/download_cub.html index b618fbcf48..bce34c6d19 100644 --- a/docs/download_cub.html +++ b/docs/download_cub.html @@ -37,14 +37,14 @@
If your download doesn't start in 3s:

- -Download CUB! + +Download CUB!
diff --git a/docs/mainpage.dox b/docs/mainpage.dox index 4e677b8c1f..2889bc0370 100644 --- a/docs/mainpage.dox +++ b/docs/mainpage.dox @@ -58,7 +58,7 @@    -Download CUB v1.2.3 +Download CUB v1.3.0 @@ -506,9 +506,46 @@ for many of the algorithmic techniques used by CUB. \par + +
+05/12/2014
+[CUB v1.3.0](download_cub.html) +
+ - New features: + - CUB's collective (block-wide, warp-wide) primitives underwent a minor + interface refactoring: + - To provide the appropriate support for multidimensional thread blocks, + the interfaces for collective classes are now template-parameterized + by X, Y, and Z block dimensions (with \p BLOCK_DIM_Y and \p BLOCK_DIM_Z being + optional, and \p BLOCK_DIM_X replacing \p BLOCK_THREADS). Furthermore, the + constructors that accept remapped linear thread-identifiers have been + removed: all primitives now assume a row-major thread-ranking for + multidimensional thread blocks. + - To allow the host program (compiled by the host-pass) to + accurately determine the device-specific storage requirements for + a given collective (compiled for each device-pass), the interfaces + for collective classes are now (optionally) template-parameterized + by the desired PTX compute capability. This is useful when + aliasing collective storage to shared memory that has been + allocated dynamically by the host at the kernel call site. + - Most CUB programs having typical 1D usage should not require any + changes to accommodate these updates. + - Bug fixes: + - Fixed bug in cub::WarpScan (which affected cub::BlockScan and + cub::DeviceScan) where incorrect results (e.g., NAN) would often be + returned when parameterized for floating-point types (fp32, fp64). + - Workaround-fix for ptxas error when compiling with -G flag on Linux + (for debug instrumentation) + - Misc. workaround-fixes for certain scan scenarios (using custom + scan operators) where code compiled for SM1x is run on newer + GPUs of higher compute-capability: the compiler could not tell + which memory space was being used for collective operations and was + mistakenly using global ops instead of shared ops. + - See the [change-log](CHANGE_LOG.TXT) for further details +
04/01/2014
-[CUB v1.2.3](download_cub.html) +CUB v1.2.3
- Bug fixes: - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types diff --git a/test/test_block_scan.cu b/test/test_block_scan.cu index 31f4d13c31..16b7bc6b4f 100644 --- a/test/test_block_scan.cu +++ b/test/test_block_scan.cu @@ -642,7 +642,7 @@ void Test( #if defined(_WIN32) || defined(_WIN64) // Accommodate ptxas crash bug (access violation) on Windows - static const bool special_skip = (TEST_ARCH <= 130) && (Equals::VALUE) && (ITEMS_PER_THREAD > 1); + static const bool special_skip = (TEST_ARCH <= 130) && (Equals::VALUE) && (BLOCK_DIM_Z > 1); #else static const bool special_skip = false; #endif @@ -795,7 +795,6 @@ void Test() // complex Test(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85), CUB_TYPE_STRING(Sum)); Test(Sum(), TestBar(0, 0), TestBar(17, 21), CUB_TYPE_STRING(Sum)); - }