diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 2dec19f58c..25844faca3 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -706,7 +706,7 @@ struct DispatchReduce : cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. { - typedef DispatchReduce::MaxPolicy MaxPolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; cudaError error = cudaSuccess; do diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh index e9d7b54aa2..f5f6c92484 100644 --- a/cub/thread/thread_store.cuh +++ b/cub/thread/thread_store.cuh @@ -342,14 +342,18 @@ __device__ __forceinline__ void ThreadStoreVolatilePtr( #else - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); - + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + VolatileWord words[VOLATILE_MULTIPLE]; - *reinterpret_cast(words) = val; -// VolatileWord *words = reinterpret_cast(&val); + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( reinterpret_cast(ptr), @@ -384,13 +388,18 @@ __device__ __forceinline__ void ThreadStore( Int2Type modifier, Int2Type is_pointer) { - typedef typename UnitWord::DeviceWord DeviceWord; // Word type for memcopying - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + DeviceWord words[DEVICE_MULTIPLE]; - *reinterpret_cast(words) = val; + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( reinterpret_cast(ptr), diff --git a/test/test_block_reduce.cu b/test/test_block_reduce.cu index 983c90ce59..688d62925a 100644 --- a/test/test_block_reduce.cu +++ b/test/test_block_reduce.cu @@ -36,6 +36,7 @@ #include #include +#include #include #include diff --git a/test/test_block_scan.cu b/test/test_block_scan.cu index 5df36838c2..cbe31b8726 100644 --- a/test/test_block_scan.cu +++ b/test/test_block_scan.cu @@ -36,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu index 31e4a8af2d..a97bcc1c5c 100644 --- a/test/test_device_histogram.cu +++ b/test/test_device_histogram.cu @@ -36,6 +36,7 @@ #include #include #include +#include #include diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu index 1b1366818b..9876916fb5 100644 --- a/test/test_device_radix_sort.cu +++ b/test/test_device_radix_sort.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu index 49addd9c1e..89a4a35f1a 100644 --- a/test/test_device_reduce.cu +++ b/test/test_device_reduce.cu @@ -1131,9 +1131,11 @@ void TestType( int max_items, int max_segments) { - DeviceReducePolicy::MaxPolicy::Invoke( - g_ptx_version, - TestBySize(max_items, max_segments)); + typedef typename DeviceReducePolicy::MaxPolicy MaxPolicyT; + + TestBySize dispatch(max_items, max_segments); + + MaxPolicyT::Invoke(g_ptx_version, dispatch); } diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu index 8165158470..84ff8f67c6 100644 --- a/test/test_device_reduce_by_key.cu +++ b/test/test_device_reduce_by_key.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu index 4a134833e9..169282295a 100644 --- a/test/test_device_run_length_encode.cu +++ b/test/test_device_run_length_encode.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu index f681846b34..b58e6a12c9 100644 --- a/test/test_device_scan.cu +++ b/test/test_device_scan.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu index 8952585cc5..45b4883430 100644 --- a/test/test_device_select_if.cu +++ b/test/test_device_select_if.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu index 0b888575b2..847d47987d 100644 --- a/test/test_device_select_unique.cu +++ b/test/test_device_select_unique.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_iterator.cu b/test/test_iterator.cu index 2b36e3c2ca..8a66d3b285 100644 --- a/test/test_iterator.cu +++ b/test/test_iterator.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index 58c09d0d4d..a8597e5d20 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include diff --git a/test/test_warp_scan.cu b/test/test_warp_scan.cu index 4821d8b702..916b8aff87 100644 --- a/test/test_warp_scan.cu +++ b/test/test_warp_scan.cu @@ -34,6 +34,7 @@ #define CUB_STDERR #include +#include #include #include @@ -348,6 +349,13 @@ void Test( T *p_prefix = NULL; T aggregate = Initialize(gen_mode, h_in, h_reference, LOGICAL_WARP_THREADS, scan_op, identity, p_prefix); + if (g_verbose) + { + printf("Input: \n"); + DisplayResults(h_in, LOGICAL_WARP_THREADS); + printf("\n"); + } + for (int i = 0; i < LOGICAL_WARP_THREADS; ++i) { h_aggregate[i] = aggregate; @@ -367,9 +375,9 @@ void Test( CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * LOGICAL_WARP_THREADS)); // Run kernel - printf("Test-mode %d, gen-mode %d, %s warpscan, %d warp threads, %s (%d bytes) elements:\n", - TEST_MODE, - gen_mode, + printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n", + TEST_MODE, typeid(TEST_MODE).name(), + gen_mode, typeid(gen_mode).name(), (Equals::VALUE) ? "Inclusive" : "Exclusive", LOGICAL_WARP_THREADS, typeid(T).name(), @@ -455,7 +463,6 @@ void Test(GenMode gen_mode) int ptx_version; CubDebugExit(PtxVersion(ptx_version)); - // primitive Test(gen_mode, Sum(), (char) 0, (char) 99); Test(gen_mode, Sum(), (short) 0, (short) 99);