diff --git a/.cproject b/.cproject
index 18ffbc70f7..e76d1da67e 100644
--- a/.cproject
+++ b/.cproject
@@ -23,13 +23,19 @@
-
+
-
+
diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
index 4afefc8f3b..43860e691e 100644
--- a/CHANGE_LOG.TXT
+++ b/CHANGE_LOG.TXT
@@ -1,6 +1,7 @@
-1.7.4 09/19/2017
+1.7.4 09/20/2017
- Bug fixes:
- - Issue #114: Can't pair non-trivially-constructible values in radix sort
+ - Issue #114: Can't pair non-trivially-constructible values in radix sort
+ - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32
//-----------------------------------------------------------------------------
diff --git a/README.md b/README.md
index a3327076a9..c107d673d5 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
About CUB
-Current release: v1.7.4 (09/19/2017)
+Current release: v1.7.4 (09/20/2017)
We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh
index 550fa349fa..682a5bfedc 100644
--- a/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -112,9 +112,10 @@ struct WarpReduceShfl
// Thread fields
//---------------------------------------------------------------------
- int lane_id;
- int member_mask;
+ unsigned int lane_id;
+
+ unsigned int member_mask;
//---------------------------------------------------------------------
// Construction
@@ -126,9 +127,9 @@ struct WarpReduceShfl
:
lane_id(LaneId()),
- member_mask(IS_ARCH_WARP ?
- 0xffffffff :
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
+ 0 : // arch-width subwarps need not be tiled within the arch-warp
+ ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}
@@ -470,13 +471,13 @@ struct WarpReduceShfl
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
- // Get the last thread in the logical warp
- int first_warp_thread = 0;
- int last_warp_thread = LOGICAL_WARP_THREADS - 1;
+ // Get the lane of the first and last thread in the logical warp
+ int first_thread = 0;
+ int last_thread = LOGICAL_WARP_THREADS - 1;
if (!IS_ARCH_WARP)
{
- first_warp_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
- last_warp_thread |= lane_id;
+ first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
+ last_thread |= lane_id;
}
// Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
@@ -484,8 +485,8 @@ struct WarpReduceShfl
// Get the last valid lane
int last_lane = (ALL_LANES_VALID) ?
- last_warp_thread :
- CUB_MIN(last_warp_thread, first_warp_thread + lanes_with_valid_data);
+ last_thread :
+ CUB_MIN(last_thread, first_thread + lanes_with_valid_data);
T output = input;
@@ -516,6 +517,7 @@ struct WarpReduceShfl
// Get the start flags for each thread in the warp.
int warp_flags = WARP_BALLOT(flag, member_mask);
+ // Convert to tail-segmented
if (HEAD_SEGMENTED)
warp_flags >>= 1;
diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh
index ab7e3dfde2..9ba8e94d12 100644
--- a/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/cub/warp/specializations/warp_reduce_smem.cuh
@@ -113,12 +113,14 @@ struct WarpReduceSmem
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
+
lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS),
- member_mask(!IS_POW_OF_TWO ?
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+ ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}
/******************************************************************************
diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh
index 05faaf930f..f0deb8ddef 100644
--- a/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/warp/specializations/warp_scan_shfl.cuh
@@ -46,6 +46,8 @@ namespace cub {
/**
* \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
*/
template <
typename T, ///< Data type being scanned
@@ -98,12 +100,11 @@ struct WarpScanShfl
__device__ __forceinline__ WarpScanShfl(
TempStorage &/*temp_storage*/)
:
- lane_id(IS_ARCH_WARP ?
- LaneId() :
- LaneId() % LOGICAL_WARP_THREADS),
- member_mask(IS_ARCH_WARP ?
- 0xffffffff :
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+ lane_id(LaneId()),
+
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
+ 0 : // arch-width subwarps need not be tiled within the arch-warp
+ ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}
@@ -594,7 +595,12 @@ struct WarpScanShfl
{
inclusive = scan_op(initial_value, inclusive);
exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
- if (lane_id == 0)
+
+ unsigned int segment_id = (IS_ARCH_WARP) ?
+ lane_id :
+ lane_id % LOGICAL_WARP_THREADS;
+
+ if (segment_id == 0)
exclusive = initial_value;
}
diff --git a/cub/warp/specializations/warp_scan_smem.cuh b/cub/warp/specializations/warp_scan_smem.cuh
index 0a7c2bdaf6..c3a7a94ba2 100644
--- a/cub/warp/specializations/warp_scan_smem.cuh
+++ b/cub/warp/specializations/warp_scan_smem.cuh
@@ -104,12 +104,14 @@ struct WarpScanSmem
TempStorage &temp_storage)
:
temp_storage(temp_storage.Alias()),
+
lane_id(IS_ARCH_WARP ?
LaneId() :
LaneId() % LOGICAL_WARP_THREADS),
- member_mask(!IS_POW_OF_TWO ?
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled
- (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+ ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
{}
diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu
index 7ff04ef8ba..130f20e3e8 100644
--- a/test/test_warp_reduce.cu
+++ b/test/test_warp_reduce.cu
@@ -778,7 +778,10 @@ template
void Test()
{
Test<1, LOGICAL_WARP_THREADS>();
- Test<2, LOGICAL_WARP_THREADS>();
+
+ // Only power-of-two subwarps can be tiled
+ if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo::VALUE)
+ Test<2, LOGICAL_WARP_THREADS>();
}