diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index 2dec19f58c..25844faca3 100644
--- a/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/device/dispatch/dispatch_reduce.cuh
@@ -706,7 +706,7 @@ struct DispatchReduce :
cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
- typedef DispatchReduce::MaxPolicy MaxPolicyT;
+ typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
cudaError error = cudaSuccess;
do
diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh
index e9d7b54aa2..f5f6c92484 100644
--- a/cub/thread/thread_store.cuh
+++ b/cub/thread/thread_store.cuh
@@ -342,14 +342,18 @@ __device__ __forceinline__ void ThreadStoreVolatilePtr(
#else
- typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying
+ // Create a temporary using shuffle-words, then store using volatile-words
+ typedef typename UnitWord::VolatileWord VolatileWord;
+ typedef typename UnitWord::ShuffleWord ShuffleWord;
const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-
+ const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
+
VolatileWord words[VOLATILE_MULTIPLE];
- *reinterpret_cast(words) = val;
-// VolatileWord *words = reinterpret_cast(&val);
+ #pragma unroll
+ for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+ reinterpret_cast(words)[i] = reinterpret_cast(&val)[i];
IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
reinterpret_cast(ptr),
@@ -384,13 +388,18 @@ __device__ __forceinline__ void ThreadStore(
Int2Type modifier,
Int2Type is_pointer)
{
- typedef typename UnitWord::DeviceWord DeviceWord; // Word type for memcopying
-
- const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
+ // Create a temporary using shuffle-words, then store using device-words
+ typedef typename UnitWord::DeviceWord DeviceWord;
+ typedef typename UnitWord::ShuffleWord ShuffleWord;
+ const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
+ const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
+
DeviceWord words[DEVICE_MULTIPLE];
- *reinterpret_cast(words) = val;
+ #pragma unroll
+ for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
+ reinterpret_cast(words)[i] = reinterpret_cast(&val)[i];
IterateThreadStore<0, DEVICE_MULTIPLE>::template Store(
reinterpret_cast(ptr),
diff --git a/test/test_block_reduce.cu b/test/test_block_reduce.cu
index 983c90ce59..688d62925a 100644
--- a/test/test_block_reduce.cu
+++ b/test/test_block_reduce.cu
@@ -36,6 +36,7 @@
#include
#include
+#include
#include
#include
diff --git a/test/test_block_scan.cu b/test/test_block_scan.cu
index 5df36838c2..cbe31b8726 100644
--- a/test/test_block_scan.cu
+++ b/test/test_block_scan.cu
@@ -36,6 +36,7 @@
#include
#include
#include
+#include
#include
#include
diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu
index 31e4a8af2d..a97bcc1c5c 100644
--- a/test/test_device_histogram.cu
+++ b/test/test_device_histogram.cu
@@ -36,6 +36,7 @@
#include
#include
#include
+#include
#include
diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu
index 1b1366818b..9876916fb5 100644
--- a/test/test_device_radix_sort.cu
+++ b/test/test_device_radix_sort.cu
@@ -35,6 +35,7 @@
#include
#include
+#include
#include
#include
diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index 49addd9c1e..89a4a35f1a 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -1131,9 +1131,11 @@ void TestType(
int max_items,
int max_segments)
{
- DeviceReducePolicy::MaxPolicy::Invoke(
- g_ptx_version,
- TestBySize(max_items, max_segments));
+ typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT;
+
+ TestBySize dispatch(max_items, max_segments);
+
+ MaxPolicyT::Invoke(g_ptx_version, dispatch);
}
diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu
index 8165158470..84ff8f67c6 100644
--- a/test/test_device_reduce_by_key.cu
+++ b/test/test_device_reduce_by_key.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu
index 4a134833e9..169282295a 100644
--- a/test/test_device_run_length_encode.cu
+++ b/test/test_device_run_length_encode.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu
index f681846b34..b58e6a12c9 100644
--- a/test/test_device_scan.cu
+++ b/test/test_device_scan.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu
index 8952585cc5..45b4883430 100644
--- a/test/test_device_select_if.cu
+++ b/test/test_device_select_if.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu
index 0b888575b2..847d47987d 100644
--- a/test/test_device_select_unique.cu
+++ b/test/test_device_select_unique.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_iterator.cu b/test/test_iterator.cu
index 2b36e3c2ca..8a66d3b285 100644
--- a/test/test_iterator.cu
+++ b/test/test_iterator.cu
@@ -35,6 +35,7 @@
#include
#include
+#include
#include
#include
diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu
index 58c09d0d4d..a8597e5d20 100644
--- a/test/test_warp_reduce.cu
+++ b/test/test_warp_reduce.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
diff --git a/test/test_warp_scan.cu b/test/test_warp_scan.cu
index 4821d8b702..916b8aff87 100644
--- a/test/test_warp_scan.cu
+++ b/test/test_warp_scan.cu
@@ -34,6 +34,7 @@
#define CUB_STDERR
#include
+#include
#include
#include
@@ -348,6 +349,13 @@ void Test(
T *p_prefix = NULL;
T aggregate = Initialize(gen_mode, h_in, h_reference, LOGICAL_WARP_THREADS, scan_op, identity, p_prefix);
+ if (g_verbose)
+ {
+ printf("Input: \n");
+ DisplayResults(h_in, LOGICAL_WARP_THREADS);
+ printf("\n");
+ }
+
for (int i = 0; i < LOGICAL_WARP_THREADS; ++i)
{
h_aggregate[i] = aggregate;
@@ -367,9 +375,9 @@ void Test(
CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * LOGICAL_WARP_THREADS));
// Run kernel
- printf("Test-mode %d, gen-mode %d, %s warpscan, %d warp threads, %s (%d bytes) elements:\n",
- TEST_MODE,
- gen_mode,
+ printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n",
+ TEST_MODE, typeid(TEST_MODE).name(),
+ gen_mode, typeid(gen_mode).name(),
(Equals::VALUE) ? "Inclusive" : "Exclusive",
LOGICAL_WARP_THREADS,
typeid(T).name(),
@@ -455,7 +463,6 @@ void Test(GenMode gen_mode)
int ptx_version;
CubDebugExit(PtxVersion(ptx_version));
-
// primitive
Test(gen_mode, Sum(), (char) 0, (char) 99);
Test(gen_mode, Sum(), (short) 0, (short) 99);