From f87bd49b66a303706c4b13e6003fad105ed3aa38 Mon Sep 17 00:00:00 2001 From: dumerrill Date: Mon, 14 Dec 2015 13:12:25 -0500 Subject: [PATCH] 1.5.0 doxgen added bool sorting test Former-commit-id: 9607db2517fced1c3bda71dac951007be0a02300 --- CHANGE_LOG.TXT | 15 + cub/block/block_load.cuh | 3 +- docs/download_cub.html | 6 +- docs/html/CHANGE_LOG.TXT | 15 + docs/html/annotated.html | 121 ++-- .../arg__index__input__iterator_8cuh.html | 4 +- ...g__index__input__iterator_8cuh_source.html | 46 +- docs/html/block__discontinuity_8cuh.html | 2 +- .../block__discontinuity_8cuh_source.html | 4 +- docs/html/block__exchange_8cuh.html | 2 +- ...__exchange_8cuh_source.html.REMOVED.git-id | 2 +- docs/html/block__histogram_8cuh.html | 2 +- docs/html/block__histogram_8cuh_source.html | 4 +- docs/html/block__load_8cuh.html | 2 +- ...lock__load_8cuh_source.html.REMOVED.git-id | 2 +- docs/html/block__radix__sort_8cuh.html | 2 +- docs/html/block__radix__sort_8cuh_source.html | 60 +- docs/html/block__reduce_8cuh.html | 2 +- docs/html/block__reduce_8cuh_source.html | 4 +- docs/html/block__scan_8cuh.html | 2 +- ...lock__scan_8cuh_source.html.REMOVED.git-id | 2 +- docs/html/block__store_8cuh.html | 2 +- docs/html/block__store_8cuh_source.html | 16 +- ...cache__modified__input__iterator_8cuh.html | 2 +- ...modified__input__iterator_8cuh_source.html | 2 +- ...ache__modified__output__iterator_8cuh.html | 2 +- ...odified__output__iterator_8cuh_source.html | 2 +- ..._1_1_arg_index_input_iterator-members.html | 37 +- ...classcub_1_1_arg_index_input_iterator.html | 25 +- ...sscub_1_1_block_discontinuity-members.html | 2 +- .../classcub_1_1_block_discontinuity.html | 2 +- .../classcub_1_1_block_exchange-members.html | 2 +- docs/html/classcub_1_1_block_exchange.html | 2 +- .../classcub_1_1_block_histogram-members.html | 2 +- docs/html/classcub_1_1_block_histogram.html | 2 +- .../html/classcub_1_1_block_load-members.html | 2 +- docs/html/classcub_1_1_block_load.html | 12 +- ...classcub_1_1_block_radix_sort-members.html | 2 +- docs/html/classcub_1_1_block_radix_sort.html | 4 +- .../classcub_1_1_block_reduce-members.html | 2 +- docs/html/classcub_1_1_block_reduce.html | 14 +- .../html/classcub_1_1_block_scan-members.html | 2 +- ...lasscub_1_1_block_scan.html.REMOVED.git-id | 2 +- .../classcub_1_1_block_store-members.html | 2 +- docs/html/classcub_1_1_block_store.html | 2 +- ...cache_modified_input_iterator-members.html | 2 +- ...cub_1_1_cache_modified_input_iterator.html | 2 +- ...ache_modified_output_iterator-members.html | 2 +- ...ub_1_1_cache_modified_output_iterator.html | 2 +- ...b_1_1_constant_input_iterator-members.html | 2 +- .../classcub_1_1_constant_input_iterator.html | 2 +- ...b_1_1_counting_input_iterator-members.html | 2 +- .../classcub_1_1_counting_input_iterator.html | 2 +- .../classcub_1_1_swizzle_scan_op-members.html | 2 +- docs/html/classcub_1_1_swizzle_scan_op.html | 2 +- ...ub_1_1_tex_obj_input_iterator-members.html | 2 +- .../classcub_1_1_tex_obj_input_iterator.html | 2 +- ...ub_1_1_tex_ref_input_iterator-members.html | 2 +- .../classcub_1_1_tex_ref_input_iterator.html | 2 +- ..._1_1_transform_input_iterator-members.html | 2 +- ...classcub_1_1_transform_input_iterator.html | 2 +- .../classcub_1_1_warp_reduce-members.html | 2 +- docs/html/classcub_1_1_warp_reduce.html | 2 +- docs/html/classcub_1_1_warp_scan-members.html | 2 +- ...classcub_1_1_warp_scan.html.REMOVED.git-id | 2 +- docs/html/classes.html | 69 +- docs/html/constant__input__iterator_8cuh.html | 2 +- ...constant__input__iterator_8cuh_source.html | 2 +- docs/html/counting__input__iterator_8cuh.html | 2 +- ...counting__input__iterator_8cuh_source.html | 2 +- docs/html/cub_8cuh.html | 4 +- docs/html/cub_8cuh_source.html | 88 +-- docs/html/device__histogram_8cuh.html | 2 +- docs/html/device__histogram_8cuh_source.html | 638 +++++++++--------- docs/html/device__partition_8cuh.html | 2 +- docs/html/device__partition_8cuh_source.html | 152 ++--- docs/html/device__radix__sort_8cuh.html | 2 +- .../html/device__radix__sort_8cuh_source.html | 554 +++++++-------- docs/html/device__reduce_8cuh.html | 3 +- docs/html/device__reduce_8cuh_source.html | 488 +++++++------- .../device__run__length__encode_8cuh.html | 2 +- ...vice__run__length__encode_8cuh_source.html | 176 ++--- docs/html/device__scan_8cuh.html | 2 +- docs/html/device__scan_8cuh_source.html | 268 ++++---- .../device__segmented__radix__sort_8cuh.html | 2 +- ...e__segmented__radix__sort_8cuh_source.html | 633 +++++++++-------- docs/html/device__segmented__reduce_8cuh.html | 3 +- ...device__segmented__reduce_8cuh_source.html | 461 ++++++------- docs/html/device__select_8cuh.html | 4 +- docs/html/device__select_8cuh_source.html | 224 +++--- docs/html/device__spmv_8cuh.html | 2 +- docs/html/device__spmv_8cuh_source.html | 2 +- .../dir_011e1c944d88f71be72e1e24a5fda7cf.html | 2 +- .../dir_18fc672d63781b5a743137aee24ff656.html | 4 +- .../dir_80932b4cec52750ff92b1a1912314cf5.html | 2 +- .../dir_bb50a5ef59f19d030d06415663184d05.html | 2 +- .../dir_cb3a671affffe7eeb3fdf5ae58e42cc8.html | 2 +- .../dir_d583f216f1aafe19404e836b0c097ad2.html | 2 +- docs/html/download_cub.html | 6 +- .../example_block_radix_sort_8cu-example.html | 2 +- .../example_block_reduce_8cu-example.html | 2 +- docs/html/example_block_scan_8cu-example.html | 2 +- .../example_device_histogram_8cu-example.html | 2 +- ..._device_partition_flagged_8cu-example.html | 2 +- ...ample_device_partition_if_8cu-example.html | 2 +- ...example_device_radix_sort_8cu-example.html | 36 +- .../example_device_reduce_8cu-example.html | 6 +- .../html/example_device_scan_8cu-example.html | 2 +- ...ple_device_select_flagged_8cu-example.html | 2 +- .../example_device_select_if_8cu-example.html | 2 +- ...mple_device_select_unique_8cu-example.html | 2 +- docs/html/examples.html | 2 +- docs/html/files.html | 35 +- docs/html/functions.html | 14 +- docs/html/functions_0x62.html | 3 +- docs/html/functions_0x63.html | 6 +- docs/html/functions_0x64.html | 18 +- docs/html/functions_0x65.html | 3 +- docs/html/functions_0x66.html | 3 +- docs/html/functions_0x68.html | 3 +- docs/html/functions_0x69.html | 3 +- docs/html/functions_0x6c.html | 3 +- docs/html/functions_0x6d.html | 9 +- docs/html/functions_0x6e.html | 6 +- docs/html/functions_0x6f.html | 80 ++- docs/html/functions_0x70.html | 3 +- docs/html/functions_0x72.html | 15 +- docs/html/functions_0x73.html | 35 +- docs/html/functions_0x74.html | 3 +- docs/html/functions_0x75.html | 3 +- docs/html/functions_0x76.html | 15 +- docs/html/functions_0x77.html | 3 +- docs/html/functions_0x7e.html | 3 +- docs/html/functions_func.html | 10 +- docs/html/functions_func_0x62.html | 2 +- docs/html/functions_func_0x63.html | 5 +- docs/html/functions_func_0x64.html | 7 +- docs/html/functions_func_0x65.html | 2 +- docs/html/functions_func_0x66.html | 2 +- docs/html/functions_func_0x68.html | 2 +- docs/html/functions_func_0x69.html | 2 +- docs/html/functions_func_0x6c.html | 2 +- docs/html/functions_func_0x6d.html | 8 +- docs/html/functions_func_0x6e.html | 5 +- docs/html/functions_func_0x6f.html | 80 +-- docs/html/functions_func_0x72.html | 10 +- docs/html/functions_func_0x73.html | 17 +- docs/html/functions_func_0x74.html | 2 +- docs/html/functions_func_0x75.html | 2 +- docs/html/functions_func_0x77.html | 2 +- docs/html/functions_func_0x7e.html | 2 +- docs/html/functions_rela.html | 2 +- docs/html/functions_type.html | 24 +- docs/html/functions_vars.html | 20 +- docs/html/globals.html | 8 +- docs/html/globals_defs.html | 8 +- docs/html/group___block_module.html | 2 +- docs/html/group___collective_module.html | 2 +- docs/html/group___device_module.html | 2 +- docs/html/group___segmented_module.html | 10 +- docs/html/group___single_module.html | 2 +- docs/html/group___util_io.html | 20 +- docs/html/group___util_iterator.html | 4 +- docs/html/group___util_mgmt.html | 12 +- docs/html/group___util_module.html | 20 +- docs/html/group___util_ptx.html | 2 +- docs/html/group___warp_module.html | 2 +- docs/html/hierarchy.html | 105 ++- docs/html/index.html | 12 +- docs/html/modules.html | 2 +- docs/html/namespacecub.html.REMOVED.git-id | 2 +- docs/html/namespacemembers.html | 2 +- docs/html/namespacemembers_enum.html | 2 +- docs/html/namespacemembers_eval.html | 2 +- docs/html/namespacemembers_func.html | 2 +- docs/html/namespaces.html | 2 +- docs/html/search/all_61.js | 9 +- docs/html/search/all_63.js | 7 +- docs/html/search/all_64.js | 6 +- docs/html/search/all_69.js | 1 - docs/html/search/all_6d.js | 4 +- docs/html/search/all_6e.js | 2 +- docs/html/search/all_6f.js | 2 +- docs/html/search/all_72.js | 4 +- docs/html/search/all_73.js | 10 +- docs/html/search/all_74.js | 18 +- docs/html/search/all_75.js | 2 - docs/html/search/all_76.js | 1 - docs/html/search/classes_63.js | 3 +- docs/html/search/classes_64.js | 4 +- docs/html/search/classes_69.js | 1 - docs/html/search/classes_74.js | 20 +- docs/html/search/files_64.js | 1 + docs/html/search/functions_61.js | 6 +- docs/html/search/functions_63.js | 3 +- docs/html/search/functions_64.js | 3 +- docs/html/search/functions_6d.js | 4 +- docs/html/search/functions_6e.js | 3 +- docs/html/search/functions_6f.js | 2 +- docs/html/search/functions_72.js | 4 +- docs/html/search/functions_73.js | 4 +- docs/html/search/search.js | 8 +- docs/html/search/typedefs_64.js | 1 - docs/html/search/typedefs_76.js | 1 - docs/html/structcub_1_1_arg_max-members.html | 2 +- docs/html/structcub_1_1_arg_max.html | 6 +- docs/html/structcub_1_1_arg_min-members.html | 2 +- docs/html/structcub_1_1_arg_min.html | 6 +- ..._block_discontinuity_1_1_temp_storage.html | 28 +- ...1_block_discontinuity_1_1_temp_storage.png | Bin 1495 -> 1458 bytes ...b_1_1_block_exchange_1_1_temp_storage.html | 28 +- ...ub_1_1_block_exchange_1_1_temp_storage.png | Bin 1962 -> 1931 bytes ..._1_1_block_histogram_1_1_temp_storage.html | 28 +- ...b_1_1_block_histogram_1_1_temp_storage.png | Bin 1956 -> 1926 bytes ...00_01_d_u_m_m_y_01_4_1_1_temp_storage.html | 30 +- ..._00_01_d_u_m_m_y_01_4_1_1_temp_storage.png | Bin 2382 -> 2345 bytes ..._o_s_402c3164d23f1ec647db5dad06a54584.html | 30 +- ...p_o_s_402c3164d23f1ec647db5dad06a54584.png | Bin 2406 -> 2375 bytes ..._o_s_e4c36dfe8f549604998f6c46cc8fbd1d.html | 30 +- ...p_o_s_e4c36dfe8f549604998f6c46cc8fbd1d.png | Bin 2520 -> 2487 bytes ...ctcub_1_1_block_load_1_1_temp_storage.html | 30 +- ...uctcub_1_1_block_load_1_1_temp_storage.png | Bin 1919 -> 1890 bytes ...1_1_block_radix_sort_1_1_temp_storage.html | 28 +- ..._1_1_block_radix_sort_1_1_temp_storage.png | Bin 2567 -> 2538 bytes ...cub_1_1_block_reduce_1_1_temp_storage.html | 28 +- ...tcub_1_1_block_reduce_1_1_temp_storage.png | Bin 1576 -> 1538 bytes ...ctcub_1_1_block_scan_1_1_temp_storage.html | 28 +- ...uctcub_1_1_block_scan_1_1_temp_storage.png | Bin 1563 -> 1531 bytes ..._00_09dfae03f13932c7dbdb41be30a5767ba.html | 28 +- ...e_00_09dfae03f13932c7dbdb41be30a5767ba.png | Bin 2399 -> 2361 bytes ..._s_p_263becc1ca5b47586740c2f7bb0d0145.html | 28 +- ...n_s_p_263becc1ca5b47586740c2f7bb0d0145.png | Bin 2595 -> 2561 bytes ..._s_p_8d170856b7ed1df0ed565731a681b449.html | 28 +- ...n_s_p_8d170856b7ed1df0ed565731a681b449.png | Bin 2439 -> 2421 bytes ...tcub_1_1_block_store_1_1_temp_storage.html | 28 +- ...ctcub_1_1_block_store_1_1_temp_storage.png | Bin 1929 -> 1899 bytes ..._1_1_caching_device_allocator-members.html | 2 +- ...tructcub_1_1_caching_device_allocator.html | 2 +- docs/html/structcub_1_1_cast-members.html | 2 +- docs/html/structcub_1_1_cast.html | 2 +- ...tructcub_1_1_device_histogram-members.html | 2 +- ...b_1_1_device_histogram.html.REMOVED.git-id | 2 +- ...tructcub_1_1_device_partition-members.html | 2 +- docs/html/structcub_1_1_device_partition.html | 14 +- ...ructcub_1_1_device_radix_sort-members.html | 2 +- .../html/structcub_1_1_device_radix_sort.html | 94 ++- .../structcub_1_1_device_reduce-members.html | 6 +- docs/html/structcub_1_1_device_reduce.html | 190 +++--- ..._1_1_device_run_length_encode-members.html | 2 +- ...tructcub_1_1_device_run_length_encode.html | 14 +- .../structcub_1_1_device_scan-members.html | 2 +- docs/html/structcub_1_1_device_scan.html | 31 +- ...1_device_segmented_radix_sort-members.html | 4 +- ...e_segmented_radix_sort.html.REMOVED.git-id | 2 +- ...b_1_1_device_segmented_reduce-members.html | 14 +- ...structcub_1_1_device_segmented_reduce.html | 270 ++++---- .../structcub_1_1_device_select-members.html | 2 +- docs/html/structcub_1_1_device_select.html | 19 +- .../structcub_1_1_device_spmv-members.html | 2 +- docs/html/structcub_1_1_device_spmv.html | 4 +- docs/html/structcub_1_1_equality-members.html | 2 +- docs/html/structcub_1_1_equality.html | 2 +- docs/html/structcub_1_1_equals-members.html | 2 +- docs/html/structcub_1_1_equals.html | 4 +- docs/html/structcub_1_1_if-members.html | 2 +- docs/html/structcub_1_1_if.html | 4 +- .../structcub_1_1_inequality-members.html | 2 +- docs/html/structcub_1_1_inequality.html | 2 +- ...uctcub_1_1_inequality_wrapper-members.html | 2 +- .../structcub_1_1_inequality_wrapper.html | 2 +- .../structcub_1_1_is_pointer-members.html | 2 +- docs/html/structcub_1_1_is_pointer.html | 8 +- .../structcub_1_1_is_volatile-members.html | 2 +- docs/html/structcub_1_1_is_volatile.html | 8 +- docs/html/structcub_1_1_log2-members.html | 2 +- docs/html/structcub_1_1_log2.html | 10 +- docs/html/structcub_1_1_max-members.html | 2 +- docs/html/structcub_1_1_max.html | 2 +- docs/html/structcub_1_1_min-members.html | 2 +- docs/html/structcub_1_1_min.html | 2 +- .../structcub_1_1_power_of_two-members.html | 2 +- docs/html/structcub_1_1_power_of_two.html | 8 +- ...tructcub_1_1_reduce_by_key_op-members.html | 2 +- docs/html/structcub_1_1_reduce_by_key_op.html | 2 +- ...tcub_1_1_reduce_by_segment_op-members.html | 2 +- .../structcub_1_1_reduce_by_segment_op.html | 8 +- ...ructcub_1_1_remove_qualifiers-members.html | 2 +- .../html/structcub_1_1_remove_qualifiers.html | 4 +- docs/html/structcub_1_1_sum-members.html | 2 +- docs/html/structcub_1_1_sum.html | 2 +- ...tcub_1_1_warp_reduce_1_1_temp_storage.html | 28 +- ...ctcub_1_1_warp_reduce_1_1_temp_storage.png | Bin 1310 -> 1280 bytes ...uctcub_1_1_warp_scan_1_1_temp_storage.html | 28 +- ...ructcub_1_1_warp_scan_1_1_temp_storage.png | Bin 1287 -> 1259 bytes docs/html/tex__obj__input__iterator_8cuh.html | 2 +- ...tex__obj__input__iterator_8cuh_source.html | 2 +- docs/html/tex__ref__input__iterator_8cuh.html | 2 +- ...tex__ref__input__iterator_8cuh_source.html | 2 +- docs/html/thread__load_8cuh.html | 2 +- docs/html/thread__load_8cuh_source.html | 38 +- docs/html/thread__operators_8cuh.html | 2 +- docs/html/thread__operators_8cuh_source.html | 18 +- docs/html/thread__store_8cuh.html | 2 +- docs/html/thread__store_8cuh_source.html | 137 ++-- .../html/transform__input__iterator_8cuh.html | 2 +- ...ransform__input__iterator_8cuh_source.html | 2 +- docs/html/util__allocator_8cuh_source.html | 14 +- docs/html/util__arch_8cuh.html | 2 +- docs/html/util__arch_8cuh_source.html | 2 +- docs/html/util__debug_8cuh.html | 10 +- docs/html/util__debug_8cuh_source.html | 8 +- docs/html/util__device_8cuh.html | 5 +- docs/html/util__device_8cuh_source.html | 448 ++++++------ docs/html/util__ptx_8cuh.html | 2 +- docs/html/util__ptx_8cuh_source.html | 4 +- docs/html/util__type_8cuh.html | 21 +- ...util__type_8cuh_source.html.REMOVED.git-id | 2 +- docs/html/warp__reduce_8cuh.html | 2 +- docs/html/warp__reduce_8cuh_source.html | 4 +- docs/html/warp__scan_8cuh.html | 2 +- docs/html/warp__scan_8cuh_source.html | 4 +- docs/mainpage.dox | 6 +- test/test_device_radix_sort.cu | 3 + test/test_util.h | 25 + 324 files changed, 3345 insertions(+), 3878 deletions(-) diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 202ec6f160..96973a5e90 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,5 +1,20 @@ //----------------------------------------------------------------------------- +1.5.0 12/14/2015 + - New Features: + - Added new segmented device-wide operations for device-wide sort and + reduction primitives. + - Bug fixes: + - Fix for Git Issue 36 (Compilation error with GCC 4.8.4 nvcc 7.0.27) and + Forums thread (ThreadLoad generates compiler errors when loading from + pointer-to-const) + - Fix for Git Issue 29 (DeviceRadixSort::SortKeys yields compiler + errors) + - Fix for Git Issue 26 (CUDA error: misaligned address after + cub::DeviceRadixSort::SortKeys()) + +//----------------------------------------------------------------------------- + 1.4.1 04/13/2015 - Bug fixes: - Fixes for CUDA 7.0 issues with SHFL-based warp-scan and warp-reduction diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh index 2df8cfebb5..4803576922 100644 --- a/cub/block/block_load.cuh +++ b/cub/block/block_load.cuh @@ -200,7 +200,8 @@ __device__ __forceinline__ void InternalLoadDirectBlockedVectorized( #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - items[ITEM] = reinterpret_cast(vec_items)[ITEM]; +// items[ITEM] = reinterpret_cast(vec_items)[ITEM]; + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); } } diff --git a/docs/download_cub.html b/docs/download_cub.html index e900a57c53..a9f8ae9530 100644 --- a/docs/download_cub.html +++ b/docs/download_cub.html @@ -37,14 +37,14 @@
If your download doesn't start in 3s:

- -Download CUB! + +Download CUB!
diff --git a/docs/html/CHANGE_LOG.TXT b/docs/html/CHANGE_LOG.TXT index 202ec6f160..96973a5e90 100644 --- a/docs/html/CHANGE_LOG.TXT +++ b/docs/html/CHANGE_LOG.TXT @@ -1,5 +1,20 @@ //----------------------------------------------------------------------------- +1.5.0 12/14/2015 + - New Features: + - Added new segmented device-wide operations for device-wide sort and + reduction primitives. + - Bug fixes: + - Fix for Git Issue 36 (Compilation error with GCC 4.8.4 nvcc 7.0.27) and + Forums thread (ThreadLoad generates compiler errors when loading from + pointer-to-const) + - Fix for Git Issue 29 (DeviceRadixSort::SortKeys yields compiler + errors) + - Fix for Git Issue 26 (CUDA error: misaligned address after + cub::DeviceRadixSort::SortKeys()) + +//----------------------------------------------------------------------------- + 1.4.1 04/13/2015 - Bug fixes: - Fixes for CUDA 7.0 issues with SHFL-based warp-scan and warp-reduction diff --git a/docs/html/annotated.html b/docs/html/annotated.html index a31cdfc151..2398acf04e 100644 --- a/docs/html/annotated.html +++ b/docs/html/annotated.html @@ -103,145 +103,144 @@  oCCachingDeviceAllocatorA simple caching allocator for device memory allocations  oCIfType selection (IF ? ThenType : ElseType)  oCEqualsType equality test - oCNullTypeA simple "NULL" marker type - oCInt2TypeAllows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) - oCCubVectorExposes a member typedef Type that names the corresponding CUDA vector type if one exists. Otherwise Type refers to the CubVector structure itself, which will wrap the corresponding x, y, etc. vector fields - oCUninitializedA storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions - oCKeyValuePairA key identifier paired with a corresponding value - oCDoubleBufferDouble-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth - oCLog2Statically determine log2(N), rounded up - oCPowerOfTwoStatically determine if N is a power-of-two - oCIsPointerPointer vs. iterator - oCIsVolatileVolatile modifier test - oCRemoveQualifiersRemoves const and volatile qualifiers from type Tp - oCArgIndexInputIteratorA random-access input wrapper for pairing dereferenced values with their corresponding indices (forming KeyValuePair tuples) - oCCacheModifiedInputIteratorA random-access input wrapper for dereferencing array values using a PTX cache load modifier - oCCacheModifiedOutputIteratorA random-access output wrapper for storing array values using a PTX cache-modifier - oCConstantInputIteratorA random-access input generator for dereferencing a sequence of homogeneous values - oCCountingInputIteratorA random-access input generator for dereferencing a sequence of incrementing integer values - oCTexObjInputIteratorA random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects - oCTexRefInputIteratorA random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references - oCTransformInputIteratorA random-access input wrapper for transforming dereferenced values - oCEqualityDefault equality functor - oCInequalityDefault inequality functor - oCInequalityWrapperInequality functor (wraps equality functor) - oCSumDefault sum functor - oCMaxDefault max functor - oCArgMaxArg max functor (keeps the value and offset of the first occurrence of the larger item) - oCMinDefault min functor - oCArgMinArg min functor (keeps the value and offset of the first occurrence of the smallest item) - oCCastDefault cast functor - oCSwizzleScanOpBinary operator wrapper for switching non-commutative scan arguments - oCReduceBySegmentOpReduce-by-segment functor - oCReduceByKeyOp< Binary reduction operator to apply to values - oCBlockDiscontinuityThe BlockDiscontinuity class provides collective methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.

+ oCLog2Statically determine log2(N), rounded up + oCPowerOfTwoStatically determine if N is a power-of-two + oCIsPointerPointer vs. iterator + oCIsVolatileVolatile modifier test + oCRemoveQualifiersRemoves const and volatile qualifiers from type Tp + oCArgIndexInputIteratorA random-access input wrapper for pairing dereferenced values with their corresponding indices (forming KeyValuePair tuples) + oCCacheModifiedInputIteratorA random-access input wrapper for dereferencing array values using a PTX cache load modifier + oCCacheModifiedOutputIteratorA random-access output wrapper for storing array values using a PTX cache-modifier + oCConstantInputIteratorA random-access input generator for dereferencing a sequence of homogeneous values + oCCountingInputIteratorA random-access input generator for dereferencing a sequence of incrementing integer values + oCTexObjInputIteratorA random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects + oCTexRefInputIteratorA random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references + oCTransformInputIteratorA random-access input wrapper for transforming dereferenced values + oCEqualityDefault equality functor + oCInequalityDefault inequality functor + oCInequalityWrapperInequality functor (wraps equality functor) + oCSumDefault sum functor + oCMaxDefault max functor + oCArgMaxArg max functor (keeps the value and offset of the first occurrence of the larger item) + oCMinDefault min functor + oCArgMinArg min functor (keeps the value and offset of the first occurrence of the smallest item) + oCCastDefault cast functor + oCSwizzleScanOpBinary operator wrapper for switching non-commutative scan arguments + oCReduceBySegmentOpReduce-by-segment functor + oCReduceByKeyOp< Binary reduction operator to apply to values + oCBlockDiscontinuityThe BlockDiscontinuity class provides collective methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.

discont_logo.png
- |\CTempStorageThe operations exposed by BlockDiscontinuity require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockExchangeThe BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.

+ |\CTempStorageThe operations exposed by BlockDiscontinuity require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockExchangeThe BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.

transpose_logo.png
- |\CTempStorageThe operations exposed by BlockExchange require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockHistogramThe BlockHistogram class provides collective methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.

+ |\CTempStorageThe operations exposed by BlockExchange require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockHistogramThe BlockHistogram class provides collective methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.

histogram_logo.png
- |\CTempStorageThe operations exposed by BlockHistogram require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockLoadThe BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.

+ |\CTempStorageThe operations exposed by BlockHistogram require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockLoadThe BlockLoad class provides collective data movement methods for loading a linear segment of items from memory into a blocked arrangement across a CUDA thread block.

block_load_logo.png
- |\CTempStorageThe operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockRadixSortThe BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.

+ |\CTempStorageThe operations exposed by BlockLoad require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockRadixSortThe BlockRadixSort class provides collective methods for sorting items partitioned across a CUDA thread block using a radix sorting method.

sorting_logo.png
- |\CTempStorageThe operations exposed by BlockScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockReduceThe BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.

+ |\CTempStorageThe operations exposed by BlockScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockReduceThe BlockReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread block.

reduce_logo.png
- |\CTempStorageThe operations exposed by BlockReduce require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockScanThe BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.

+ |\CTempStorageThe operations exposed by BlockReduce require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockScanThe BlockScan class provides collective methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.

block_scan_logo.png
- |\CTempStorageThe operations exposed by BlockScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCBlockStoreThe BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.

+ |\CTempStorageThe operations exposed by BlockScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCBlockStoreThe BlockStore class provides collective data movement methods for writing a blocked arrangement of items partitioned across a CUDA thread block to a linear segment of memory.

block_store_logo.png
- |\CTempStorageThe operations exposed by BlockStore require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - oCDeviceHistogramDeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.

+ |\CTempStorageThe operations exposed by BlockStore require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + oCDeviceHistogramDeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.

histogram_logo.png
- oCDevicePartitionDevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.

+ oCDevicePartitionDevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.

partition_logo.png
- oCDeviceRadixSortDeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.

+ oCDeviceRadixSortDeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.

sorting_logo.png
- oCDeviceReduceDeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.

+ oCDeviceReduceDeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.

reduce_logo.png
- oCDeviceRunLengthEncodeDeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory.

+ oCDeviceRunLengthEncodeDeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory.

run_length_encode_logo.png
- oCDeviceScanDeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.

+ oCDeviceScanDeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.

device_scan.png
- oCDeviceSegmentedRadixSortDeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.

+ oCDeviceSegmentedRadixSortDeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.

segmented_sorting_logo.png
- oCDeviceSelectDeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.

+ oCDeviceSegmentedReduceDeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory.

+
+reduce_logo.png +
+ + oCDeviceSelectDeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.

select_logo.png
- oCDeviceSpmvDeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV) - oCWarpScanThe WarpScan class provides collective methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.

+ oCDeviceSpmvDeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV) + oCWarpScanThe WarpScan class provides collective methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.

warp_scan_logo.png
- |\CTempStorageThe operations exposed by WarpScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse - \CWarpReduceThe WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.

+ |\CTempStorageThe operations exposed by WarpScan require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse + \CWarpReduceThe WarpReduce class provides collective methods for computing a parallel reduction of items partitioned across a CUDA thread warp.

warp_reduce_logo.png
-  \CTempStorageThe operations exposed by WarpReduce require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse +  \CTempStorageThe operations exposed by WarpReduce require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the __shared__ keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or union'd with other storage allocation types to facilitate memory reuse